stepf/RiboPip

View on GitHub
lib/ribopip/feature_counts.rb

Summary

Maintainability
A
1 hr
Test Coverage
module Ribopip
  # FeatureCounts
  class FeatureCounts
    extend Ribopip

    # Detects key collisions in hash and push elements into an array to avoid
    # data loss
    #
    # hash - hash
    # key  - key
    # data - data
    def push_into_hash(hash, key, data)
      if hash[key.to_sym].nil?
        hash[key.to_sym] = [data]
      else # existing key
        hash[key.to_sym].push(data)
      end
    end

    # Parses GTF annotation and writes a tab-seperated file, which assigns
    # biotypes to gene_ids. This file will be used to process FeatureCounts
    # output. Writes "#{annotation}.types".
    #
    # annotation - input gtf file
    def self.write_index(annotation)
      features = {}
      names = {}
      # get 9th column of all exons
      exons_annotation = `cat #{annotation} | grep -P '\texon\t' | cut -f9`

      # build hash
      exons_annotation.each_line do |line|
        gene_id = line[/gene_id\W+\w+/].delete('"').split[1]
        name = line[/gene_name\W+\w+/].delete('"').split[1]
        name = name.nil? ? 'n/a' : name
        type = line[/gene_biotype\W+\w+/].delete('"').split[1]

        # new gene_id
        push_into_hash(features, gene_id, type)
        push_into_hash(names, gene_id, name)
      end

      # write index
      file_types = File.open("#{annotation}.types", 'w')
      features.each do |gene_id, ft|
        file_types.printf("%s\t%s\t%s\n", gene_id, ft.uniq.sort * '; ',
                          names[:"#{gene_id}"].uniq.sort * '; ')
      end
      file_types.close
    end

    # Parses a tab-seperated file into a hash, which assigns biotypes to
    # gene_ids. This file will be used to process FeatureCounts output.
    #
    # filename - input file
    #
    # Returns hash.
    def self.read_index(filename)
      features = {}
      File.readlines(filename).each do |line|
        key = line.split("\t")[0]
        values = line.split("\t")[1].strip.split('; ')
        features[:"#{key}"] = values
      end

      features
    end

    # Computes feature distribution from feature counts output
    #
    # infile     - input file
    # outdistri  - output file containing ft distribution
    # outsums    - output file containing ft distribution sums
    # annotation - genomic annotation
    def self.compute_distri(infile, outdistri, outsums, annotation)
      fail "#{infile} does not exist." unless File.exist?(infile)

      # build feature index for annotation, if non-existent
      write_index(annotation) unless File.exist?("#{annotation}.types")

      # read feature index into hash
      features = read_index("#{annotation}.types")
      cumulative_sums = {}

      # write feature distribution
      file_distri = File.open(outdistri, 'w')
      File.readlines(infile).drop(1).each do |line|
        gene_id = line.split[0]
        type = features[:"#{gene_id}"].nil? ? 'nil' : features[:"#{gene_id}"] * '; '
        num = line.split[6].to_i
        next if num < 0
        file_distri.printf("%s\t%s\t%d\n", gene_id, type, num)
        if cumulative_sums[:"#{type}"].nil?
          cumulative_sums[:"#{type}"] = num
        else
          cumulative_sums[:"#{type}"] += num
        end
      end
      file_distri.close

      # write cumulative sums in descending order
      file_sums = File.open(outsums, 'w')
      cumulative_sums.sort_by(&:last).reverse_each do |type, num|
        file_sums.printf("%s\t%d\n", type, num)
      end
      file_sums.close
    end

    # Wrapper function. Runs feature counts and computes field distribution
    #
    # infile     - input file
    # outfile    - output file
    # annotation - genomic annotation (gtf)
    # target     - gtf feature type (CDS, exon, gene, Selenocysteine,
    #              start_codon, stop_codon, transcript, UTR)
    def self.compute(infile, outfile, annotation, target = 'exon')
      fail "#{infile} does not exist." unless File.exist?(infile)
      run_cmd(
        "featureCounts -t #{target} -a #{annotation} -o #{outfile} #{infile}"
      )
    end
  end
end