wurmlab/GeneValidator

View on GitHub
lib/genevalidator/clusterization.rb

Summary

Maintainability
D
2 days
Test Coverage
C
76%
# Top level module / namespace.
module GeneValidator
  Pair = Struct.new(:x, :y) do
    include Comparable

    ##
    # Overload '-' operator
    # Returns the euclidian distane between two pairs
    def -(other)
      xx = other.x
      yy = other.y
      Math.sqrt((x - xx) * (x - xx) + (y - yy) * (y - yy))
    end

    def print
      warn "Cluster: #{x} #{y}"
    end

    ##
    # Overload '+' operator
    # This will modify the current object
    def +(other)
      self.x += other.x
      self.y += other.y
    end

    ##
    # Overload '*' operator
    # This will modify the current object
    def *(other)
      self.x *= other
      self.y *= other
    end

    ##
    # Overload '/' operator
    # This will modify the current object
    def /(other)
      self.x /= other.to_f
      self.y /= other.to_f
    end

    ##
    # Overload quality operator
    # Returns true if the pairs are equal, false otherwise
    def ==(other)
      other.x == x && other.y == y ? true : false
    end

    def eql?(other)
      self == other
    end

    def hash
      [self.x, self.y].hash
    end
  end

  class PairCluster
    # a hash map containing the pair (object, no_occurences)
    attr_accessor :objects

    def initialize(objects)
      @objects = objects
    end

    def print
      objects.each do |elem|
        warn "(#{elem[0].x},#{elem[0].y}): #{elem[1]}"
      end
    end

    ##
    # Returns the weighted mean value of the cluster
    def mean
      mean = Pair.new(0, 0)
      weight = 0

      objects.each do |object, n|
        (1..n).each do |_i|
          mean + object
          weight += 1
        end
      end
      mean / weight
      mean
    end

    ##
    # Returns the density of the cluster: how many values it contains
    def density
      d = 0
      objects.each do |elem|
        d += elem[1]
      end
      d
    end

    # Returns the euclidian distance between the current cluster and the one
    # given as parameter
    # Params:
    # +cluster+: Cluster object
    # +method+: 0 or 1
    # method = 0: do not into condseideration duplicate values
    # method = 1: average linkage clusterization
    def distance(cluster, method = 0)
      d = 0
      norm = 0

      cluster.objects.each do |elem1|
        objects.each do |elem2|
          if method == 1
            d += elem1[1] * elem2[1] * (elem1[0] - elem2[0]).abs
            norm += elem1[1] * elem2[1]
          else
            d += (elem1[0] - elem2[0]).abs
            norm = cluster.objects.length * objects.length
          end
        end
      end

      # group average distance
      d /= (norm + 0.0)
    end

    ##
    # Returns within cluster sum of squares
    def wss(objects = nil)
      if objects.nil?
        objects = @objects.map { |x| Array.new(x[1], x[0]) }.flatten
      end

      cluster_mean = mean
      ss = 0
      objects.each do |object|
        ss += (cluster_mean - object) * (cluster_mean - object)
      end
      ss
    end

    ##
    # Merges the current cluster with the one given as parameter
    # +clusters+ vector of Cluster objects
    def add(cluster)
      cluster.objects.each do |elem|
        objects[elem[0]] = elem[1]
      end
    end
  end

  ##
  # Stores the values belonging to one cluster
  # Used for clusterization among a vector of values
  class Cluster
    # a hash map containing the pair (length, no_occurences)
    attr_accessor :lengths

    def initialize(lengths)
      @lengths = lengths
    end

    ##
    # Returns the weighted mean value of the cluster
    def mean
      mean_len = 0
      weight = 0

      lengths.each do |length, n|
        mean_len += length * n
        weight += n
      end
      mean_len /= weight
    end

    ##
    # Returns the density of the cluster: how many values it contains
    def density
      d = 0
      lengths.each do |elem|
        d += elem[1]
      end
      d
    end

    # Returns the euclidian distance between the current cluster and the one
    # given as parameter
    # Params:
    # +cluster+: Cluster object
    # +method+: 0 or 1
    # method = 0: do not into condseideration duplicate values
    # method = 1: average linkage clusterization
    def distance(cluster, method = 0)
      d = 0
      norm = 0

      cluster.lengths.each do |elem1|
        lengths.each do |elem2|
          if method == 1
            d += elem1[1] * elem2[1] * (elem1[0] - elem2[0]).abs
            norm += elem1[1] * elem2[1]
          else
            d += (elem1[0] - elem2[0]).abs
            norm = cluster.lengths.length * lengths.length
          end
        end
      end

      # group average distance
      d /= (norm + 0.0)
      d.round(4)
    end

    ##
    # Returns within cluster sum of squares
    def wss(lengths = nil)
      if lengths.nil?
        lengths = @lengths.map { |x| Array.new(x[1], x[0]) }.flatten
      end

      cluster_mean = mean
      ss = 0
      lengths.each do |len|
        ss += (cluster_mean - len) * (cluster_mean - len)
      end
      ss
    end

    ##
    # Returns the standard deviation of a set of values
    # Params:
    # +lengths+: a vector of values (optional, by default it takes the values
    # in the cluster)
    # Output:
    # Real number
    def standard_deviation(lengths = nil)
      if lengths.nil?
        lengths = @lengths.map { |x| Array.new(x[1], x[0]) }.flatten
      end

      cluster_mean = mean
      std_deviation = 0
      lengths.each do |len|
        std_deviation += (cluster_mean - len) * (cluster_mean - len)
      end
      std_deviation = Math.sqrt(std_deviation.to_f / (lengths.length - 1))
    end

    ##
    # Returns the deviation of a value from the values in all clusters
    # Params:
    # +clusters+: a list of Cluster objects
    # +queryLength+: a reference Sequence object
    # Output:
    # Real number
    def deviation(clusters, queryLength)
      hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten
      raw_hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten.to_s.delete('[').delete(']')
      R.eval("sd = sd(c(#{raw_hits}))")
      sd = R.pull('sd')
      sd = standard_deviation(hits)
      (queryLength - mean).abs / sd
    end

    ##
    # Merges the current cluster with the one given as parameter
    # +clusters+ vector of Cluster objects
    def add(cluster)
      cluster.lengths.each do |elem|
        lengths[elem[0]] = elem[1]
      end
    end

    ##
    # Prints the current cluster
    def print
      warn "Cluster: mean = #{mean}, density = #{density}"
      lengths.sort { |a, b| a <=> b }.each do |elem|
        warn "#{elem[0]}, #{elem[1]}"
      end
      warn '--------------------------'
    end

    ##
    # Returns the interval limits of the current cluster
    def get_limits
      lengths.map { |elem| elem[0] }.minmax
    end

    ##
    # Returns whether the value is inside the cluster
    # Params:
    # +value+: value to compare
    # Output:
    # :ok or :shorter or :longer
    def inside_cluster(value)
      limits = get_limits
      left = limits[0]
      right = limits[1]

      :ok if left <= value && right >= value
      :shorter if left >= value
      :longer if right <= value
    end
  end

  class HierarchicalClusterization
    attr_accessor :values
    attr_accessor :clusters

    ##
    # Object initialization
    # Params:
    # +values+ :vector of values
    def initialize(values)
      @values = values
      @clusters = []
    end

    def hierarchical_clusterization_2d(no_clusters = 0, distance_method = 0,
                                       vec = @values, debug = false)
      clusters = []

      if vec.length == 1
        hash = { vec[0] => 1 }
        cluster = PairCluster.new(hash)
        clusters.push(cluster)
        clusters
      end

      # Thresholds
      # threshold_distance = (0.25 * (vec.max-vec.min))
      threshold_density = (0.5 * vec.length).to_i

      # make a histogram from the input vector
      histogram = Hash[vec.group_by { |a| a }.map { |k, vs| [k, vs.length] }]

      # clusters = array of clusters
      # initially each length belongs to a different cluster
      histogram.each do |e|
        warn "pair (#{e[0].x} #{e[0].y}) appears #{e[1]} times" if debug
        hash = { e[0] => e[1] }
        cluster = PairCluster.new(hash)
        clusters.push(cluster)
      end

      clusters.each(&:print) if debug

      return clusters if clusters.length == 1

      # each iteration merge the closest two adiacent cluster
      # the loop stops according to the stop conditions
      iteration = 0
      loop do
        # stop condition 1
        break if no_clusters != 0 && clusters.length == no_clusters

        iteration += iteration
        warn "\nIteration #{iteration}" if debug

        min_distance = 100_000_000
        cluster1     = 0
        cluster2     = 0
        density      = 0

        [*(0..(clusters.length - 2))].each do |i|
          [*((i + 1)..(clusters.length - 1))].each do |j|
            dist = clusters[i].distance(clusters[j], distance_method)
            warn "distance between clusters #{i} and #{j} is #{dist}" if debug
            current_density = clusters[i].density + clusters[j].density
            if dist < min_distance
              min_distance = dist
              cluster1     = i
              cluster2     = j
              density      = current_density
            elsif dist == min_distance && density < current_density
              cluster1 = i
              cluster2 = j
              density  = current_density
            end
          end
        end

        # merge clusters 'cluster1' and 'cluster2'
        warn "clusters to merge #{cluster1} and #{cluster2}" if debug

        clusters[cluster1].add(clusters[cluster2])
        clusters.delete_at(cluster2)

        if debug
          clusters.each_with_index do |elem, i|
            warn "cluster #{i}"
            elem.print
          end
        end

        # stop condition 3
        # the density of the biggest clusters exceeds the threshold
        if no_clusters == 0 && clusters[cluster].density > threshold_density
          break
        end
      end

      @clusters = clusters
    end

    ##
    # Makes an hierarchical clusterization until the most dense cluster is
    # obtained or the distance between clusters is sufficintly big
    # or the desired number of clusters is obtained
    # Params:
    # +no_clusters+: stop test (number of clusters)
    # +distance_method+: distance method (method 0 or method 1)
    # +vec+: a vector of values (by default the values from initialization)
    # +debug+: display debug information
    # Output:
    # vector of +Cluster+ objects
    def hierarchical_clusterization(no_clusters = 0, distance_method = 0,
                                    vec = @values, debug = false)
      clusters = []
      vec = vec.sort

      if vec.length == 1
        hash    = { vec[0] => 1 }
        cluster = Cluster.new(hash)
        clusters.push(cluster)
        clusters
      end

      # Thresholds
      threshold_distance = (0.25 * (vec.max - vec.min))
      threshold_density  = (0.5 * vec.length).to_i

      # make a histogram from the input vector
      histogram = Hash[vec.group_by { |x| x }.map { |k, vs| [k, vs.length] }]

      # clusters = array of clusters
      # initially each length belongs to a different cluster
      histogram.sort_by { |a| a[0] }.each do |elem|
        warn "len #{elem[0]} appears #{elem[1]} times" if debug
        hash = { elem[0] => elem[1] }
        cluster = Cluster.new(hash)
        clusters.push(cluster)
      end

      clusters.each(&:print) if debug

      return clusters if clusters.length == 1

      # each iteration merge the closest two adiacent cluster
      # the loop stops according to the stop conditions
      iteration = 0
      loop do
        # stop condition 1
        break if no_clusters != 0 && clusters.length == no_clusters

        iteration += iteration
        warn "\nIteration #{iteration}" if debug

        min_distance = 100_000_000
        cluster      = 0
        density      = 0

        clusters[0..clusters.length - 2].each_with_index do |_item, i|
          dist = clusters[i].distance(clusters[i + 1], distance_method)
          warn "distance btwn clusters #{i} and #{i + 1} is #{dist}" if debug
          current_density = clusters[i].density + clusters[i + 1].density
          if dist < min_distance
            min_distance = dist
            cluster = i
            density = current_density
          elsif dist == min_distance && density < current_density
            cluster = i
            density = current_density
          end
        end

        # stop condition 2
        # the distance between the closest clusters exceeds the threshold
        if no_clusters == 0 && (clusters[cluster].mean - clusters[cluster + 1].mean).abs > threshold_distance
          break
        end

        # merge clusters 'cluster' and 'cluster'+1
        warn "clusters to merge #{cluster} and #{cluster + 1}" if debug

        clusters[cluster].add(clusters[cluster + 1])
        clusters.delete_at(cluster + 1)

        if debug
          clusters.each_with_index do |elem, i|
            warn "cluster #{i}"
            elem.print
          end
        end

        # stop condition 3
        # the density of the biggest clusters exceeds the threshold
        if no_clusters == 0 && clusters[cluster].density > threshold_density
          break
        end
      end

      @clusters = clusters
    end

    ##
    # Returns the cluster with the maimum density
    # Params:
    # +clusters+: list of +Clususter+ objects
    def most_dense_cluster(clusters = @clusters)
      max_density = 0
      max_density_cluster = 0

      nil if clusters.nil?

      clusters.each_with_index do |item, i|
        if item.density > max_density
          max_density = item.density
          max_density_cluster = i
        end
      end
      clusters[max_density_cluster]
    end
  end
end