generate_comparison_scores.rb
require 'mspire/mzml'
require 'mspire/peaklist'
require 'mspire/spectrum'
require 'mspire/bin'
require 'optparse'
require 'pry'
PPM_BIN_DEFAULT = 10
MAXQ = 40
VERBOSE = false
options = {ppm: PPM_BIN_DEFAULT, qmax: MAXQ}
opt_parse = OptionParser.new do |opts|
opts.on_tail("-h", "--help", "Show this message") do
puts opts
exit
end
opts.banner = "Usage: #{__FILE__} truth_file.mzML comparison_file.mzML ... truth_file[n].mzML comparison_file[n].mzML"
opts.separator "Takes sets of spectral files and kicks out comparison stats for them"
opts.separator "Output: truth_file_comparison_file.csv ... truth_file[n]_comparison_file[n].csv"
opts.on("-v", "--verbose", "Verbose mode") do |v|
VERBOSE = v
end
opts.on("-p", "--ppm N", Float, "Set the PPM tolerance for searches") do |p|
options[:ppm] = p
end
opts.on("-q", "--qmax N", Integer, "Set the maximum Q value for searches") do |q|
options[:qmax] = q
end
opts.on("-o", "--output_directory STRING", String, "Set an output directory for all files written") do |o|
options[:output_dir] = o
end
opts.on("-n", "--no_file", "Don't output a csv file, instead, just output the score row") do |n|
options[:do_not_output_csv] = n
end
end
opt_parse.parse!(ARGV)
if ARGV.size % 2 != 0 or ARGV.size == 0
puts opt_parse
exit
end
def putsv(thing)
puts thing if VERBOSE
end
def ppm(m1,m2)
(m2-m1)/m1*1e6
end
def ppm_range(mass, ppm = PPM_BIN_DEFAULT)
range = (ppm*mass)/1e6
(mass-range..mass+range)
end
def ppm_between(m1, m2)
((m2-m1)/m1*1e6).abs
end
def create_bins(true_values, comparison_values, bin_width: PPM_BIN_DEFAULT)
min, max = [true_values.minmax, comparison_values.minmax].flatten.minmax
divisions = []
puts "using bin width: #{bin_width}" if $VERBOSE
puts "using ppm for bins: #{use_ppm}" if $VERBOSE
current_x = min
loop do
if current_x >= max
divisions << max
break
else
divisions << current_x
current_x += current_x./(1e6).*(bin_width)
end
end
# make each bin exclusive so there is no overlap
bins = divisions.each_cons(2).map {|pair| Mspire::Bin.new(*pair, true) }
# make the last bin *inclusive* of the terminating value
bins[-1] = Mspire::Bin.new(bins.last.begin, bins.last.end)
bins
end
def simple_matching(true_values, comparison_values, bin_width: PPM_BIN_DEFAULT)
bins = create_bins(true_values, comparison_values, bin_width: bin_width)
puts "MIGHT WANT TO ADJUST YOUR BIN SIZE... this might take a while! " if bins.size > 2_000_000
matches_bins = create_bins(true_values, comparison_values, bin_width: bin_width)
# Truth and Experiment bins
t_e_bins = create_bins(true_values, comparison_values, bin_width: bin_width)
Mspire::Bin.bin(bins, true_values)
bins.reject! {|a| a.data.empty?}
Mspire::Bin.bin(t_e_bins, true_values)
Mspire::Bin.bin(t_e_bins, comparison_values)
t_e_bins.select! {|a| a.data.size > 1}
Mspire::Bin.bin(matches_bins, comparison_values)
matches_bins.reject! {|a| a.data.empty?}
[true_values, t_e_bins, comparison_values]
end
def find_top_peaks_in_window(spectrum, window_bottom, window_top, q_max)
putsv "window_bottom: #{window_bottom}"
putsv "window_top: #{window_top}"
bottom = spectrum.find_nearest_index(window_bottom)
top = spectrum.find_nearest_index(window_top)
putsv "window: #{spectrum[bottom..top]}"
output = spectrum[bottom..top].transpose.sort_by(&:last).reverse[0..q_max]
range = window_bottom..window_top
output.select {|a| range.include?(a.first) }
end
def divide_masses_into_windows(masses)
min, max = masses.minmax
# return window start values to divide the spectrum into ~evenly sized windows < 100 Th
range = max-min
bins = (range / 100.0).ceil
size = range / bins.to_f
resp = (bins).times.map {|i| min+i*size}
resp << max
end
def matching(gold_standard_spectrum, tested_spectrum, ppm: PPM_BIN_DEFAULT, q_max: MAXQ)
# Only need masses
tested_masses = tested_spectrum.mzs
gold_standard_masses = gold_standard_spectrum.mzs
# Filter each and every 100 m/z region into top Q peaks (optimized by score)
windows_for_q = divide_masses_into_windows(tested_masses)
top_peak_lists = windows_for_q.each_cons(2).map {|b, t| find_top_peaks_in_window(tested_spectrum, b, t, q_max)}
q_lists = (1..q_max).to_a.map do |q|
# return a list of the top q peaks per region
top_peak_lists.map{|arr| arr[0...q]}.flatten(1)
end
# Matched by ppm or AMU tolerance
matches = q_lists.map.with_index do |list, i|
scores = list.map do |peak|
# finds closest match in the GS spectrum
closest_match = gold_standard_spectrum[gold_standard_spectrum.find_nearest_index(peak.first)]
#putsv "Potential problem with #{closest_match}: bigger peak nearby" if larger_peaks_nearby(gold_standard_spectrum, closest_match)
ppm_between(peak.first,closest_match.first) < ppm ? peak.first : nil
end.compact
end
matches
end
#testing
# p simple_matching([350.0, 400.00, 450.0],[375.0], bin_width: 5)
# p simple_matching([350.0, 400.00, 450.0],[350.0], bin_width: 5)
# esp = Mspire::Spectrum.new [[200.0198,200.01998, 200.0199998,205.35],[123,2123,2134,22351]]
# tsp = Mspire::Spectrum.new [[200.02,205.35,232.34,268.245,270.24],Array.new(5,1000)]
# p matching(tsp, esp, q_max: 5)
# Takes arrays of mz values
def calculate_f1_score(trues, matches, tests)
# sensitivity/precision
false_positives = tests - trues
false_negatives = trues - matches
precision = matches/(matches + false_positives).to_f || 0
sensitivity = matches/trues.to_f
ans = 2*(precision*sensitivity)/(precision+sensitivity) || 0
ans = 0 if ans.nan?
ans
end
def stats_from_matches(trues, matches, attempts)
[trues,matches,attempts].map(&:size)
end
def optimize_f1_score(gss, cs, ppm: PPM_BIN_DEFAULT, q_max: MAXQ)
matches = matching(gss, cs, ppm: ppm, q_max: q_max)
scores = (1..q_max).to_a.map.with_index do |q,i|
[calculate_f1_score(*stats_from_matches(gss.mzs, matches[i], cs.mzs))*q_max/q.to_f, q]
end
scores.sort_by {|a| a.first}.reverse
end
## ANDROMEDA
require_relative 'andromeda'
# This gives the class AndromedaPscore
# call the function #optimize to grab the score and q for the best match possible
def spectral_contrast_angle(s1, s2)
# This is from http://www.sciencedirect.com/science/article/pii/S1044030501003270
# similarity index, and spectral angle...
end
## HIT COUNT = X*hits-Y*misses+X*unpredicted where X is a scaling factor 10>=X>=1 and Y is a scaling factor 2>X>=1
# hits
def hit_count_from_numbers(hits, misses, unpredicted, x: 10, y: 1, z: 1)
x*hits - misses*y - z*unpredicted
end
def calculate_hit_count(matches, predicted, q)
match_count = matches.size
total_checked = predicted.size
hit_count_from_numbers(match_count, total_checked-match_count, q-total_checked)
end
def optimize_hit_count(gss,cs, q_max: MAXQ, ppm: PPM_BIN_DEFAULT)
matches = matching(gss,cs, ppm: ppm, q_max: q_max)
scores = (1..q_max).to_a.map.with_index do |q,i|
[calculate_hit_count(matches[i], cs.mzs, q), q]
end
scores.sort_by(&:first).reverse
end
# RUN ANALYSIS
ARGV.each_slice(2) do |ground_truth_file, comparison_file|
gss = Mspire::Mzml.open(ground_truth_file) {|mzml| mzml[0]}
cs = Mspire::Mzml.open(comparison_file) {|m| m[0]}
outfile = [ground_truth_file, comparison_file].map {|f| File.basename(f).gsub(File.extname(f),'')}.join('_') + '.csv'
outfile = File.join(options[:output_dir],outfile) if options[:output_dir]
p outfile
File.open(outfile, 'w') do |io|
output = []
output << %w(truth_file comparison_file hits @q_hits misses, unpredicted, optimizedF1-score @qdepth andromedaPscore @q_depth hitcountscore @q_h_depth).join(",")
gss = Mspire::Mzml.open(ground_truth_file) {|mzml| mzml[0]}
cs = Mspire::Mzml.open(comparison_file) {|m| m[0]}
hits, q_hit = matching(gss,cs,ppm: options[:ppm], q_max: options[:qmax]).map.with_index {|a,i| [a.size,i]}.sort_by(&:first).reverse.first
predicted = gss.mzs.size
puts "Hits: #{hits} @ q=#{q_hit}"
# SUPER SLOW!!!!!!!
# f1 = calculate_f1_score(*stats_from_matches(*simple_matching(gss.mzs, cs.mzs, bin_width: options[:ppm])))
# ANDROMEDA
a_score, a_q = AndromedaPscore.optimize(gss, cs, ppm: options[:ppm], q_max: options[:qmax]).first
# F1 SCORE
optimized_f1, q_f1 = optimize_f1_score(gss, cs, ppm: options[:ppm], q_max: options[:qmax]).first
#HITCOUNT
hitcount, h_q = optimize_hit_count(gss, cs, ppm: options[:ppm], q_max: options[:qmax]).first
# SO SLOW I MUST AVOID IT... and since it seems less discerning
# puts "F1: #{f1}"
puts "optimized_F1: #{optimized_f1} @ q=#{q_f1}"
puts "Andromeda: #{a_score} @ q=#{a_q}"
puts "HitCount: #{hitcount} @ q=#{h_q}"
output << [ground_truth_file, comparison_file, hits, q_hit, predicted-hits, q_hit-hits-predicted, optimized_f1, q_f1, a_score, h_q, hitcount, h_q].join(",")
io.puts output unless options[:do_not_output_csv]
end
end
# Testing stuff
if false #$0 == __FILE__
p calculate_f1_score(Array.new(40),Array.new(3), Array.new(5))
bins = simple_matching([350.0, 400.00, 450.0],[350.0,400.0,355.0,424.0], bin_width: 500)
p bins
p bins.first.size
p calculate_f1_score(*bins)
esp = Mspire::Spectrum.new [[200.0199,431.86,205.35,700],[123,21023,2134,22351]]
tsp = Mspire::Spectrum.new [[200.02,205.35,232.34,268.245,270.24],Array.new(5,1000)]
p AndromedaPscore.optimize(tsp,esp)
p calculate_f1_score(*simple_matching(esp.mzs, tsp.mzs))
end