stepf/RiboPip

View on GitHub
lib/ribopip/gene_id_2_name.rb

Summary

Maintainability
A
1 hr
Test Coverage
module Ribopip
  # Loads gene names from an GTF annotation and adds them to any desired
  # delimited text file.
  class GeneID2Name
    # id2name - hash with key-value pairs of gene IDs to gene names
    attr_reader :id2name

    # annot - path to gtf annotation
    def initialize(annot)
      @annot = annot
      @id2name = {}
      @index = "#{@annot}.name_index"
      if File.exist?(@index)
        read_index
      else
        build_index
      end
    end

    # Builds @id2name hash and writes @index file for reusing it
    def build_index
      # parse annotation
      File.readlines(@annot).each do |line|
        next if line[0..1] == '#!'
        gene_id = line[/gene_id\W+\w+/].delete('"').split[1]
        gene_name = line[/gene_name\W+\w+/].delete('"').split[1]
        next if gene_name.nil?
        @id2name[gene_id.to_sym] = gene_name
      end

      # write index
      index = File.open(@index, 'w')
      @id2name.each do |gene_id, gene_name|
        index.printf("%s\t%s\n", gene_id, gene_name)
      end
      index.close
    end

    # Reads pre-built index
    def read_index
      File.readlines(@index).each do |line|
        gene_id, gene_name = line.chomp.split("\t")
        @id2name[gene_id.to_sym] = gene_name
      end
    end

    # Translates gene_ids in tab-seperated files to gene_names and writes an
    # additional column into a new file.
    #
    # id_col   - column containing the ID, starting with 0
    # infile   - input file
    # outfile  - output file
    def translate(id_col, infile, outfile, delimiter = "\t")
      out = File.open(outfile, 'w')
      File.readlines(infile).each do |line|
        inelems = line.split(delimiter)
        gene_id = inelems[id_col]
        if gene_id.nil?
          out.puts
          next
        end
        if !gene_id.empty?
          gene_id.delete!('"')
          gene_name = @id2name[gene_id.to_sym].nil? ? ' ' : @id2name[gene_id.to_sym]
        else # not available
          gene_name = ''
        end
        inelems[id_col] = [gene_id, gene_name]
        out.puts inelems.flatten!.join(delimiter)
      end
      out.close
    end
  end
end