# lib/mspire/ident/peptide/db/creator.rb
require 'optparse'
require 'mspire/digester'
require 'mspire/fasta'
require 'mspire/ident/peptide/db'
class Mspire::Ident::Peptide::Db::Creator
# more than this many expandable residues in a single peptide aborts
# combinatorial expansion (expand_peptides returns nil)
MAX_NUM_AA_EXPANSION = 3
# the twenty standard amino acids
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
# ambiguity codes mapped to the residues they may stand for
# ('X' = any standard amino acid)
EXPAND_AA = {'X' => STANDARD_AA}
# default options consumed by cmdline, create, create_digestion_file and
# db_from_fasta_digestion_file
DEFAULT_PEPTIDE_CENTRIC_DB = {
missed_cleavages: 2,
min_length: 5,
enzyme: Mspire::Digester[:trypsin],
remove_digestion_file: true,
cleave_initiator_methionine: true,
expand_aa: false,
uniprot: true,
data_type: :hash,
}
# Command-line entry point: parses argv (defaults per
# DEFAULT_PEPTIDE_CENTRIC_DB) and builds one peptide-centric db per
# fasta file given.  Prints usage and exits when no files remain after
# option parsing.  Returns an array of the created db file paths.
def self.cmdline(argv)
  opt = DEFAULT_PEPTIDE_CENTRIC_DB.dup
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
    op.separator "output: "
    op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
    op.separator "format:"
    op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
    op.separator ""
    op.separator " Initiator Methionines - by default, will generate two peptides"
    op.separator " for any peptide found at the N-termini starting with 'M'"
    op.separator " (i.e., one with and one without the leading methionine)"
    op.separator ""
    op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
    op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
    op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
    op.on("--expand-x", "enumerate all aa possibilities for 'X'", "(default is to remove these peptides)") {|v| opt[:expand_aa] = v }
    op.on("--no-uniprot", "use entire protid section of fasta header", "for non-uniprot fasta files") { opt[:uniprot] = false }
    #op.on("--trie", "use a trie (for very large uniprot files)", "must have fast_trie gem installed") {|v| opt[:trie] = v }
    op.on("--data-type <#{opt[:data_type]}>", "hash|google_hash|trie", "need google_hash or fast_trie gem installed") {|v| opt[:data_type] = v.to_sym }
    op.on("-e", "--enzyme <#{opt[:enzyme].name}>", "enzyme for digestion (Mspire::Digester[name])") {|v| opt[:enzyme] = Mspire::Digester[v] }
    op.on("-v", "--verbose", "talk about it") { $VERBOSE = 5 }
    op.on("--list-enzymes", "lists approved enzymes and exits") do
      # (a stray duplicate "-v" registration used to live here -- it was a
      # copy-paste error; "-v" is already defined above)
      puts Mspire::Digester::ENZYMES.keys.join("\n")
      exit
    end
  end
  opts.parse!(argv)
  if argv.size == 0
    # was `puts opts || exit`, which parses as `puts(opts || exit)`;
    # opts is always truthy, so exit was never reached
    puts opts
    exit
  end
  argv.map do |file|
    creator = Mspire::Ident::Peptide::Db::Creator.new
    creator.create(file, opt)
  end
end
# Digests every protein in fasta_file with opts[:enzyme] and writes one
# line per protein -- "<id>\t<pep1> <pep2> ..." -- to
# <base>.msd_clvg<missed_cleavages>.peptides.
#
# opts (merged over DEFAULT_PEPTIDE_CENTRIC_DB):
#   :missed_cleavages, :enzyme, :cleave_initiator_methionine,
#   :expand_aa, :uniprot
#
# Note: min_length is NOT applied here; it is applied later when the
# final peptide-centric db is built.
# Returns the name of the digestion file that was written.
def create_digestion_file(fasta_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
  (missed_cleavages, enzyme, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :enzyme, :cleave_initiator_methionine, :expand_aa)
  start_time = Time.now
  print "Digesting #{fasta_file} ..." if $VERBOSE
  # matches any residue letter that needs combinatorial expansion (e.g. 'X')
  letters_to_expand_re = Regexp.new("[" << Regexp.escape(EXPAND_AA.keys.join) << "]")
  base = fasta_file.chomp(File.extname(fasta_file))
  digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
  File.open(digestion_file, "w") do |fh|
    Mspire::Fasta.open(fasta_file) do |fasta|
      fasta.each do |prot|
        peptides = enzyme.digest(prot.sequence, missed_cleavages)
        if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
          m_peps = []
          peptides.each do |pep|
            # if the peptide sits at the very start of the protein, also
            # emit it without the initiator methionine
            if prot.sequence[0,pep.size] == pep
              m_peps << pep[1..-1]
            end
          end
          peptides.push(*m_peps)
        end
        peptides =
          if expand_aa
            peptides.flat_map do |pep|
              # expand_peptides returns nil when a peptide has too many
              # expandable residues (> MAX_NUM_AA_EXPANSION); drop such
              # peptides instead of letting nil pollute the list
              (pep =~ letters_to_expand_re) ? (expand_peptides(pep, EXPAND_AA) || []) : pep
            end
          else
            peptides.select {|pep| pep !~ letters_to_expand_re }
          end
        header = prot.header
        id = opts[:uniprot] ? Mspire::Fasta.uniprot_id(header) : header.split(/\s/,2).first
        fh.puts( id + "\t" + peptides.join(" ") )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE
  digestion_file
end
# Builds the final peptide-centric db from a raw digestion file,
# applying opts[:min_length] and writing either a trie (data_type :trie)
# or a "<pep><KEY_VALUE_DELIMITER><id1><PROTEIN_DELIMITER><id2>..." text
# file (.yml).  Removes the digestion file when
# opts[:remove_digestion_file] is set.
# Returns the full path of the created file.
def db_from_fasta_digestion_file(digestion_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
  start_time = Time.now
  puts "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
  hash_like = hash_like_from_digestion_file(digestion_file, opts[:min_length], opts[:data_type])
  # elapsed time printed after the work (it was previously printed before
  # the call above, so it always reported ~0 sec)
  puts "#{Time.now - start_time} sec" if $VERBOSE
  base = digestion_file.chomp(File.extname(digestion_file))
  final_outfile =
    if opts[:data_type] == :trie
      base + ".min_aaseq#{opts[:min_length]}"
    else
      base + ".min_aaseq#{opts[:min_length]}" + ".yml"
    end
  start_time = Time.now
  # was an empty interpolation "#{}" -- now names the actual output file
  print "Writing #{hash_like.size} peptides to #{final_outfile} ..." if $VERBOSE
  if opts[:data_type] == :trie
    trie = hash_like
    trie.save(final_outfile)
  else
    File.open(final_outfile, 'w') do |out|
      hash_like.each do |k,v|
        out.puts( [k, v.join(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)].join(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER) )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE
  if opts[:remove_digestion_file]
    File.unlink(digestion_file)
  end
  File.expand_path(final_outfile)
end
# Lazily loads the fast_trie gem and returns a new Trie.
# Raises LoadError with an actionable message when the gem is missing.
def get_a_trie
  begin
    require 'trie'
  rescue LoadError
    # a bare `rescue` only catches StandardError, but `require` raises
    # LoadError (a ScriptError) -- so the friendly message below was
    # previously unreachable
    raise LoadError, "must first install fast_trie"
  end
  Trie.new
end
# Reads a digestion file (one "<prot_id> <pep1> <pep2> ..." line per
# protein) into a peptide => [prot_id, ...] mapping, keeping only
# peptides of at least min_length.
# data_type can be :hash (default), :google_hash (better memory and speed),
# or :trie (experimental/broken)
def hash_like_from_digestion_file(digestion_file, min_length, data_type=:hash)
  case data_type
  when :trie
    trie = get_a_trie
    each_peptide_entry(digestion_file, min_length) do |prot, pep|
      if trie.has_key?(pep)
        trie.get(pep) << prot
      else
        trie.add( pep, [prot] )
      end
    end
    trie
  when :hash, :google_hash
    hash =
      case data_type
      when :hash
        Hash.new
      when :google_hash
        require 'google_hash'
        # this guy is very slow compared to ruby hash (10X slower?) but more
        # memory efficient (but only ~10% or so improvement with this
        # application)
        GoogleHashDenseRubyToRuby.new
        # still need to try Sparse one out and compare performance
        #GoogleHashSparseRubyToRuby.new
      end
    each_peptide_entry(digestion_file, min_length) do |prot, pep|
      if hash.key?(pep)
        hash[pep] << prot
      else
        hash[pep] = [prot]
      end
    end
    hash
  end
end

# Yields (prot_id, peptide) for every unique peptide of at least
# min_length on each line of the digestion file.  (Extracted so the
# :trie and :hash branches above no longer duplicate the parsing loop.)
def each_peptide_entry(digestion_file, min_length)
  ::IO.foreach(digestion_file) do |line|
    line.chomp!
    (prot, *peps) = line.split(/\s+/)
    # prot is something like this: "P31946"
    peps.uniq!
    peps.each do |pep|
      yield(prot, pep) if pep.size >= min_length
    end
  end
end
# Digests fasta_file into a temporary digestion file (min_seq_length is
# NOT applied there -- every peptide at the configured missed_cleavages
# is written), then builds the peptide-centric db from it; the final
# file name gains 'min_aaseq<Integer>'.
# Returns the full name of the written file.
def create(fasta_file, opts={})
  merged = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
  digestion_file = create_digestion_file(fasta_file, merged)
  puts "created file of size: #{File.size(digestion_file)}" if $VERBOSE
  db_from_fasta_digestion_file(digestion_file, merged)
end
# Combinatorially expands every position whose letter is a key of
# expand_aa_hash (hash like: {'X' => STANDARD_AA}).
# Returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
# be expanded.
# Returns [peptide] (a single-element array holding the original) when
# nothing needs expansion.
def expand_peptides(peptide, expand_aa_hash)
  expandable = expand_aa_hash.keys.sort
  # [position, letter] pairs for every residue that must be expanded
  positions = []
  peptide.each_char.with_index do |char, i|
    idx = expandable.index(char)
    positions << [i, expandable[idx]] if idx
  end
  return nil if positions.size > MAX_NUM_AA_EXPANSION
  expanded = [peptide]
  positions.each do |i, letter|
    expanded = expanded.flat_map do |pep|
      expand_aa_hash[letter].map do |aa|
        copy = pep.dup
        copy[i] = aa
        copy
      end
    end
  end
  expanded
end
end