# lib/biotcm/apps/string_processor.rb
# Extracts a PPI (protein-protein interaction) network from STRING data files
#
# = Example Usage
# BioTCM::Apps::StringProcessor.new(
# 'protein.links.detailed.v10.txt',
# 'species.v10.txt'
# ).extract_by_species(
# 'protein.links.detailed.v10.homo.sapiens.txt', 'Homo sapiens'
# )
#
class BioTCM::Apps::StringProcessor
  # Version of StringProcessor
  VERSION = '0.1.0'.freeze
  # Number of bytes skipped between two probes while looking for a species
  PROBE_STRIDE = 500_000
  # Splits a STRING protein identifier ("<taxon id>.<protein id>")
  STRING_ID = /\A(?<species_id>\d+)\.(?<protein_id>.*)\z/

  # Open STRING data files
  # @param protein_links_filepath [String]
  # @param species_filepath [String]
  def initialize(protein_links_filepath, species_filepath)
    @f_protein_links = File.open(protein_links_filepath)
    begin
      @f_species = File.open(species_filepath)
    rescue StandardError
      # Do not leak the first handle when the second file cannot be opened
      @f_protein_links.close
      raise
    end
  end

  # Close both input files; calling it more than once is harmless
  # @return [nil]
  def close
    [@f_protein_links, @f_species].each { |io| io.close unless io.closed? }
    nil
  end

  # Check given STRING network file: print every species ID as it is first
  # encountered, then the number of species and the number of lines read
  # @return [nil]
  def check
    species = []
    counter = 0
    @f_protein_links.pos = 0
    @f_protein_links.each do |line|
      # Lines without a leading "<digits>." (header, blank lines) yield nil
      id = line[/\A(\d+)\./, 1]
      if id && id != species.last
        puts "Processing Species No.#{id}..."
        species << id
      end
      counter += 1
    end
    puts "Total #{species.size} kinds of species"
    puts "Total #{counter} lines"
  end

  # Extract ppi network by species
  #
  # Writes one tab-separated line per interaction, with the "<taxon id>."
  # prefix removed from both protein columns. The lines of one species are
  # expected to be contiguous in the protein links file.
  # @param filepath [String] path to output
  # @param species [String/Integer] species name or ID
  # @raise [ArgumentError] if the species cannot be found
  # @raise [SystemCallError] if the output file cannot be opened for writing
  # @return [nil]
  def extract_by_species(filepath, species = 'Homo sapiens')
    # Resolve the species before touching the output path, so that a bad
    # species neither creates an empty file nor leaves a handle open
    species_id = find_species_id(species).to_i
    raise ArgumentError, 'Illegal species given' unless species_id > 0

    counter = 0
    File.open(filepath, 'w') do |fout|
      seek_before_species(species_id)
      @f_protein_links.each do |line|
        col = line.chomp.split(' ')
        id = STRING_ID.match(col[0].to_s)
        next unless id # header or blank line
        if id[:species_id].to_i != species_id
          break if counter > 0 # first line after the species' block
          next                 # still before the species' block
        end
        # Handle proteins' names
        col[0] = id[:protein_id]
        col[1] = col[1].sub(/\A\d+\./, '') if col[1]
        fout.puts col.join("\t")
        counter += 1
      end
    end
    puts "Total #{counter} PPIs extracted"
  end

  private

  # Position the protein links file at a line start located before the first
  # line of the given species.
  #
  # The file is probed every PROBE_STRIDE bytes until a line of the species
  # is hit, then rewound to just after the last probe that missed. When no
  # probe hits before the end of the file (species absent, or its block is
  # shorter than the stride) the file is rewound to the beginning so that
  # the caller scans it linearly.
  def seek_before_species(species_id)
    io = @f_protein_links
    prefix = "#{species_id}."
    io.pos = 0
    resume_at = 0
    loop do
      line = io.gets
      if line.nil?
        resume_at = 0
        break
      end
      break if line.start_with?(prefix)
      resume_at = io.pos # start of the line following the missed probe
      io.pos += PROBE_STRIDE
      io.gets # finish reading current line
    end
    io.pos = resume_at
  end

  # Find species id by taxon_id, STRING_name_compact or official_name_NCBI
  #
  # An Integer or an all-digit String is compared for equality with the
  # taxon_id column; anything else is used as a regular expression against
  # the three columns.
  # @return [String, nil] the taxon_id column of the first matching row
  def find_species_id(species)
    taxon_id = species.to_s if species.is_a?(Integer) || species.to_s =~ /\A\d+\z/
    pattern = Regexp.new(species) unless taxon_id
    @f_species.pos = 0
    @f_species.gets # Title line
    @f_species.each do |line|
      col = line.chomp.split("\t")
      if taxon_id
        return col[0] if col[0] == taxon_id
      else
        [col[0], col[2], col[3]].each do |str|
          return col[0] if pattern =~ str
        end
      end
    end
    nil
  end
end