biotcm/biotcm

View on GitHub
lib/biotcm/databases/hgnc/parser.rb

Summary

Maintainability
B
5 hrs
Test Coverage
module BioTCM::Databases::HGNC
  # Parser module
  module Parser
    # Symbol who may refer to more than one gene is removed from the dictionary
    # but listed here paired with the corresponding official symbol, unless it's
    # an official one.
    # @return [Hash]
    # @example
    #   HGNC.ambiguous_symbol.keys & HGNC.symbol2hgncid.keys # results are all ambiguous official symbols
    def ambiguous_symbol
      @ambiguous_symbol
    end

    # Load an HGNC table from IO
    # @param fin [#gets, #each] Typically a File or IO
    # @private
    def parse(fin)
      # Headline
      names = fin.gets.chomp.split("\t")
      index2identifier = {}
      index_hgncid = nil
      BioTCM::Databases::HGNC::IDENTIFIERS.each do |identifer, name|
        if identifer == :hgncid
          index_hgncid = names.index(name)
        elsif name.is_a?(String)
          index2identifier[names.index(name)] = identifer if names.index(name)
        else
          name.each_with_index do |n, i|
            next unless names.index(n)
            index2identifier[names.index(n)] = (i == 0 ? identifer : identifer.to_s)
          end
        end
      end

      # Dynamically bulid a line processor
      process_one_line = index2identifier.collect do |index, identifer|
        # Symbol will be mapped to single item
        if identifer.is_a?(Symbol)
          %(
            unless column[#{index}] == nil || column[#{index}] == "" || column[#{index}] == "-"
              @#{identifer}2hgncid[column[#{index}]] = column[#{index_hgncid}]
              @hgncid2#{identifer}[column[#{index_hgncid}]] = column[#{index}]
            end
          )
        # Others will be mapped to list item
        else
          %{
            unless column[#{index}] == nil
              column[#{index}].split(", ").each do |id|
          } +
            if identifer == 'symbol'
              %(
                if @ambiguous_symbol[id]
                  @ambiguous_symbol[id] << @hgncid2symbol[column[#{index_hgncid}]]
                elsif @symbol2hgncid[id].nil?
                  @symbol2hgncid[id] = column[#{index_hgncid}]
                else
                  @ambiguous_symbol[id] = [@hgncid2symbol[column[#{index_hgncid}]]]
                  unless @hgncid2symbol[@symbol2hgncid[id]] == id
                    @ambiguous_symbol[id] << @hgncid2symbol[@symbol2hgncid[id]]
                    @symbol2hgncid.delete(id)
                  end
                end
              )
            else
              %(
                @#{identifer}2hgncid[id] = column[#{index_hgncid}] if @#{identifer}2hgncid[id].nil?
              )
            end +
            %(
              end
            end
            )
        end
      end.join

      # Process the content
      eval %{fin.each do |line|\n column = line.chomp.split("\\t", -1)} + process_one_line + 'end' # rubocop:disable Lint/Eval
    end

    def self.extended(mod)
      # Initialize the instance variable
      mod.instance_variable_set(:@ambiguous_symbol, {})
    end
  end
end