lib/sequenceserver/makeblastdb.rb
require 'find'
require 'forwardable'
module SequenceServer
# Smart `makeblastdb` wrapper: recursively scans database directory determining
# which files need to be formatted or re-formatted.
#
# Example usage:
#
# makeblastdb = MAKEBLASTDB.new(database_dir)
# makeblastdb.run # formats and re-formats databases in database_dir
# makeblastdb.formatted_fastas # lists formatted databases
#
class MAKEBLASTDB
extend Forwardable
GUESS_SAMPLE_SIZE = 1_048_576
def_delegators SequenceServer, :config, :sys
def initialize(database_dir)
@database_dir = database_dir
end
attr_reader :database_dir
def any_formatted?
formatted_fastas.any?
end
def any_to_format_or_reformat?
any_to_format? || any_to_reformat?
end
def no_fastas?
probably_fastas.empty?
end
# Runs makeblastdb on each file in `fastas_to_format` and
# `fastas_to_reformat`.
def run
format
reformat
end
# Format any unformatted FASTA files in database directory. Returns Array
# of files that were formatted.
def format
# Make the intent clear as well as ensure the program won't crash if we
# accidentally call format before calling scan.
return unless any_to_format?
fastas_to_format.select do |path, title, type|
make_blast_database('format', path, title, type)
end
end
# Re-format databases that require reformatting. Returns Array of files
# that were reformatted.
def reformat
# Make the intent clear as well as ensure the program won't crash if
# we accidentally call reformat before calling scan.
return unless any_to_reformat?
fastas_to_reformat.select do |path, title, type, non_parse_seqids|
make_blast_database('reformat', path, title, type, non_parse_seqids)
end
end
# Determines which FASTA files in the database directory are already
# formatted.
def formatted_fastas
return @formatted_fastas if defined?(@formatted_fastas)
@formatted_fastas = []
blastdbcmd.each_line do |line|
path, *rest = line.chomp.split("\t")
next if multipart_database_name?(path)
rest << get_categories(path)
@formatted_fastas << Database.new(path, *rest)
end
@formatted_fastas
end
def any_to_format?
fastas_to_format.any?
end
private
def any_to_reformat?
fastas_to_reformat.any?
end
# Determines which FASTA files in the database directory require
# reformatting.
def fastas_to_reformat
return @fastas_to_reformat if defined?(@fastas_to_reformat)
@fastas_to_reformat = []
formatted_fastas.each do |ff|
@fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?] if ff.v4? || ff.non_parse_seqids?
end
@fastas_to_reformat
end
# Determines which FASTA files in the database directory are
# unformatted.
def fastas_to_format
return @fastas_to_format if defined?(@fastas_to_format)
formatted_fasta_paths = formatted_fastas.map { |f| f[0] }
fasta_paths_to_format = probably_fastas - formatted_fasta_paths
@fastas_to_format = fasta_paths_to_format.map do |path|
[
path,
make_db_title(path),
guess_sequence_type_in_fasta(path)
]
end
@fastas_to_format
end
def probably_fastas
return @probably_fastas if defined?(@probably_fastas)
@probably_fastas = []
Find.find(database_dir + '/') do |path|
next if File.directory?(path)
@probably_fastas << path if probably_fasta?(path)
end
@probably_fastas
end
# Runs `blastdbcmd` to determine formatted FASTA files in the database
# directory. Returns the output of `blastdbcmd`. This method is called
# by `determine_formatted_fastas`.
def blastdbcmd
cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \
' -list_outfmt "%f %t %p %n %l %d %v"'
out, err = sys(cmd, path: config[:bin])
errpat = /BLAST Database error/
fail BLAST_DATABASE_ERROR.new(cmd, err) if err.match(errpat)
out
rescue CommandFailed => e
raise BLAST_DATABASE_ERROR.new(cmd, e.stderr)
end
# Create BLAST database, given FASTA file and sequence type in FASTA file.
def make_blast_database(action, file, title, type, non_parse_seqids = false)
return unless make_blast_database?(action, file, type)
title = confirm_database_title(title)
extract_fasta(file) unless File.exist?(file)
taxonomy = taxid_map(file, non_parse_seqids) || taxid
_make_blast_database(file, type, title, taxonomy)
end
# Show file path and guessed sequence type to the user and obtain a y/n
# response.
#
# Returns true if the user entered anything but 'n' or 'N'.
def make_blast_database?(action, file, type)
puts
puts
puts "FASTA file to #{action}: #{file}"
puts "FASTA type: #{type}"
print 'Proceed? [y/n] (Default: y): '
response = STDIN.gets.to_s.strip
!response.match(/n/i)
end
# Show the database title that we are going to use to the user for
# confirmation.
#
# Returns user input if any. Auto-determined title otherwise.
def confirm_database_title(default)
print "Enter a database title or will use '#{default}': "
from_user = STDIN.gets.to_s.strip
from_user.empty? && default || from_user
end
# Check if a '.taxid_map.txt' file exists. If not, try getting it
# using blastdbcmd.
def taxid_map(db, non_parse_seqids)
return if non_parse_seqids
taxid_map = db.sub(/#{File.extname(db)}$/, '.taxid_map.txt')
extract_taxid_map(db, taxid_map) unless File.exist?(taxid_map)
"-taxid_map #{taxid_map}" unless File.zero?(taxid_map)
end
# Get taxid from the user. Returns user input or 0.
#
# Using 0 as taxid is equivalent to not setting taxid for the database
# that will be created.
def taxid
default = 0
print 'Enter taxid (optional): '
user_response = STDIN.gets.strip
"-taxid #{user_response.empty? && default || Integer(user_response)}"
rescue ArgumentError # presumably from call to Interger()
puts 'taxid should be a number'
retry
end
def _make_blast_database(file, type, title, taxonomy)
cmd = "makeblastdb -parse_seqids -hash_index -in '#{file}'" \
" -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
" #{taxonomy}"
output = if File.directory?(file)
File.join(file, 'makeblastdb')
else
"#{file}.makeblastdb"
end
out, err = sys(
cmd,
path: config[:bin],
stderr: [output, 'stderr'].join,
stdout: [output, 'stdout'].join
)
puts out.strip
puts err.strip
true
rescue CommandFailed => e
puts <<~MSG
Could not create BLAST database for: #{file}
Tried: #{cmd}
stdout: #{e.stdout}
stderr: #{e.stderr}
MSG
exit!
end
# Extract FASTA file from BLAST database.
#
# Invoked while reformatting a BLAST database if the corresponding
# FASTA file does not exist.
def extract_fasta(db)
puts
puts 'Extracting sequences ...'
cmd = "blastdbcmd -entry all -db #{db}"
sys(cmd, stdout: db, path: config[:bin])
rescue CommandFailed => e
puts <<~MSG
Could not extract sequences from: #{db}
Tried: #{cmd}
stdout: #{e.stdout}
stderr: #{e.stderr}
MSG
exit!
end
def extract_taxid_map(db, taxmap_file)
cmd = "blastdbcmd -entry all -db #{db} -outfmt '%i %T'"
sys(cmd, stdout: taxmap_file, path: config[:bin])
rescue CommandFailed => e
# silence error
end
# Returns true if the database name appears to be a multi-part database
# name.
#
# e.g.
# /home/ben/pd.ben/sequenceserver/db/nr.00 => yes
# /home/ben/pd.ben/sequenceserver/db/nr => no
# /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes
# /home/ben/pd.ben/sequenceserver/db/nr00 => no
# /mnt/blast-db/refseq_genomic.100 => yes
def multipart_database_name?(db_name)
!db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?
end
def get_categories(path)
path
.gsub(config[:database_dir], '') # remove database_dir from path
.split('/')
.reject(&:empty?)[0..-2] # the first entry might be '' if database_dir does not end with /
end
# Returns true if first character of the file is '>'.
def probably_fasta?(file)
unless file.match(/((cdna)|(cds)|(dna)|(fa)|(faa)|(fas)|(fasta)|(fna)|(genome)|(nt)|(nuc)|(pep)|(prot))$/i)
return false
end
File.read(file, 1) == '>'
end
# Suggests improved titles when generating database names from files
# for improved apperance and readability in web interface.
# For example:
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
def make_db_title(path)
db_name = File.basename(path)
db_name.tr!('"', "'")
# removes .fasta like extension names
db_name.gsub!(File.extname(db_name), '')
# replaces _ with ' ',
db_name.gsub!(/(_)/, ' ')
# replaces '.' with ' ' when no numbers are on either side,
db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
# preserves version numbers
db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
db_name
end
# Guess whether FASTA file contains protein or nucleotide sequences by
# sampling a few few characters of the file.
def guess_sequence_type_in_fasta(file)
sequences = sample_sequences(file)
sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
sequence_types = sequence_types.uniq.compact
(sequence_types.length == 1) && sequence_types.first
end
# Read first 1,048,576 characters of the file, split the read text on
# fasta def line pattern and return.
#
# If the given file is FASTA, returns Array of as many different
# sequences in the portion of the file read. Returns the portion
# of the file read wrapped in an Array otherwise.
def sample_sequences(file, offset = 0)
sample = File.read(file, GUESS_SAMPLE_SIZE, offset)
return [] if sample.nil?
# remove all unknown bases (indicated by 'N') before sampling
sample.gsub!(/N/, '')
meaningful_samples = sample.split(/^>.+$/).map { |line| line.gsub(/^\n+$/, '') }.delete_if(&:empty?)
if meaningful_samples.empty?
offset += GUESS_SAMPLE_SIZE
sample_sequences(file, offset)
else
meaningful_samples
end
end
end
end