stepf/RiboPip

View on GitHub
lib/ribopip/cli.rb

Summary

Maintainability
D
2 days
Test Coverage
require 'thor'
require 'ribopip'

# rubocop:disable all
module Ribopip
  # the command line interface
  class CLI < Thor
    include Ribopip

    desc 'align', 'runs alignment pipeline and generates corresponding metrics'
    method_option :reads,
      :aliases => '-r',
      :banner => 'FASTQ',
      :desc => 'Path to reads.',
      :required => true,
      :type => :string
    method_option :ncrna_ref,
      :aliases => '-n',
      :banner => 'FASTA',
      :desc => 'Path to ncRNA reference.',
      :required => true,
      :type => :string
    method_option :genomic_ref,
      :aliases => '-g',
      :banner => 'FASTA',
      :desc => 'Path to genomic reference.',
      :required => true,
      :type => :string
    method_option :genomic_annotation,
      :aliases => '-a',
      :banner => 'GTF',
      :desc => 'Path to genomic annotation.',
      :required => true,
      :type => :string
    method_option :clip_linker,
      :aliases => '-l',
      :banner => 'STRING',
      :default => 'CTGTAGGCACCATCAAT',
      :desc => 'String to be clipped from 3\'',
      :type => :string
    method_option :clip_software,
      :banner => 'PROGRAM',
      :enum => ['fastx', 'cutadapt'],
      :default => 'cutadapt',
      :desc => 'Clipping software',
      :type => :string
    method_option :clip_err_rate,
      :banner => 'FLOAT',
      :default => 0.2,
      :desc => 'Maximum allowed error rate (FLOAT = #errors / region length).',
      :type => :numeric
    method_option :clip_minlen,
      :banner => 'INT',
      :desc => 'Discard reads shorter than INT.',
      :default => 0,
      :type => :numeric
    method_option :ncrna_software,
      :default => 'bowtie1',
      :banner => 'PROGRAM',
      :desc => 'Software for ncRNA alignment',
      :enum => ['bowtie1', 'bowtie2', 'bwa'],
      :type => :string
    method_option :ncrna_seedlen,
      :banner => 'INT',
      :desc => 'Seed length for bowtie1 / bowtie2 alignment',
      :default => 14,
      :type => :numeric
    method_option :ncrna_minlen,
      :banner => 'INT',
      :desc => 'Discard reads shorter than INT',
      :default => 14,
      :type => :numeric
    method_option :genomic_software,
      :default => 'tophat',
      :banner => 'PROGRAM',
      :desc => 'Software for genomic alignment',
      :enum => ['tophat', 'star'],
      :type => :string
    method_option :genomic_software_tophat,
      :default => 'bowtie1',
      :banner => 'PROGRAM',
      :desc => 'Software for genomic alignment using tophat',
      :enum => ['bowtie2', 'bowtie1'],
      :type => :string
    method_option :genomic_mismatches,
      :banner => 'INT',
      :desc => 'Max num of mismatches for genomic alignment',
      :default => 3,
      :type => :numeric
    method_option :genomic_err_rate,
      :banner => 'FLOAT',
      :desc => 'Set max num of mismatches for genomic alignment relative to ' \
        'read length (#errors = FLOAT * read length), if FLOAT > 0. ' \
        'Will overrule --genomic_mismatches.',
      :default => 0,
      :type => :numeric
    method_option :igv_ref,
      :banner => 'GENOME',
      :desc => 'Path to IGV .genome file. If set TDF track will be computed.',
      :type => :string
    method_option :force_overwrite,
      :desc => 'Overwrite all existing files',
      :default => false,
      :type => :boolean
    # runs alignment pipeline and generates corresponding metrics
    def align
      require 'ribopip/array_writer'
      require 'ribopip/binary_checker'
      require 'ribopip/counts'
      require 'ribopip/feature_counts'
      require 'ribopip/filenames'
      require 'ribopip/pipeline'
      require 'ribopip/metrics'

      bin_array = [
                    ['fastq_illumina_filter', '-h', '0.0.13', 'version'],
                    ['fastx_clipper', '-h', '0.0.13', 'version'],
                    ['fastx_trimmer', '-h', '0.0.13', 'version'],
                    ['cutadapt', '--version', '1.8.1', ''],
                    ['bowtie', '--version', '1.1.2', 'version'],
                    ['bowtie2', '--version', '2.2.5', 'version'],
                    ['bwa', '', '0.7.4', 'Version:'],
                    ['bam2fastq', '-v', '1.1.0', 'bam2fastq v'],
                    ['STAR', '', '2.4.1', 'STAR_'],
                    ['tophat', '--version', '2.0.13', 'TopHat v'],
                    ['fastq-bucketize', '-v', '.0.1', 'fastq-bucketize v.'],
                    ['samtools', '--version', '1.2', 'samtools'],
                    ['fastqc', '--version', '0.10.1', 'FastQC v'],
                    ['gt', '--version', '1.5.5', '(GenomeTools)'],
                    ['featureCounts', '-v', '1.4.4', 'featureCounts v'],
                    ['igvtools', 'version', '2.3.60', 'Version']
                  ]

      checker = Ribopip::BinaryChecker.new(bin_array)
      checker.perform_checks

      begin
        ##
        # INIT
        names = Ribopip::Filenames::Alignment.new(options[:reads])
        counts = Ribopip::Counts::Bookkeeper.new
        counts.parse_nreads(names.get('reads'), 'n_total')

        #Metrics::GtSeqstat.compute(names.get('reads'),
        #                              names.get('readsstat'))

        ##
        # PREPROCESSING: Quality filter, adapter clipping, 5' trimming
        prep = Ribopip::Pipeline::Preprocessing.new(
          names,
          options[:force_overwrite],
          options[:clip_minlen],
          options[:clip_linker],
          options[:clip_software].to_sym,
          options[:clip_err_rate]
        )
        prep.compute

        if options[:clipping_software] == 'cutadapt'
          counts.parse_nreads(
            names.get('cliplog'), 'n_postclip_adapter', 'with adapters'
          )
          counts.parse_nreads(
            names.get('cliplog'), 'n_postclip_length', 'passing filters'
          )
        end

        counts.parse_nreads(names.get('trim'), 'n_posttrim')

        ##
        # ALIGN TO NCRNA REFERENCE: Only unaligning reads are further processed
        ncrna = Ribopip::Pipeline::NcrnaAlignment.new(
          names,
          options[:force_overwrite],
          options[:ncrna_ref],
          options[:ncrna_software].to_sym,
          options[:ncrna_seedlen]
        )
        ncrna.compute
        counts.parse_nreads(names.get('ncrna'), 'n_ncrna_contaminants')
        counts.parse_nreads(names.get('fp'), 'n_reads_of_interest')

        # Compute ncRNA distribution over reference
        Metrics::FieldDistri.compute(
          names.get('ncrna'), names.get('ncrnadist'), options[:ncrna_ref]
        )
        # Compute and plot fp length distribution
        Metrics::GtSeqstat.compute(names.get('fp'), names.get('fpstat'))
        Metrics::GtSeqstat.plot(names.get('fpstat'), names.get('fpstatplot'))

        ##
        # ALIGN TO GENOMIC REF
        genomic =
          Ribopip::Pipeline::GenomicAlignment.new(
            names,
            options[:force_overwrite],
            options[:genomic_ref],
            options[:genomic_software].to_sym,
            options[:genomic_annotation],
            options[:genomic_tophat_engine].to_sym,
            options[:genomic_mismatches],
            options[:genomic_err_rate]
          )
        genomic.compute

        Metrics::FastQC.compute(names.get('mapped_merged'))
        Metrics::FastQC.compute(names.get('unmapped_merged'))
        counts.parse_nreads("#{names.get('mapped_merged')}", 'n_mapped')

        ##
        # EXTRACT UNIQUELY MAPPING READS WITH X MISMATCHES
        extraction =
          Ribopip::Pipeline::ReadExtractor.new(
            names, options[:force_overwrite], genomic.max_mismatches
          )
        extraction.compute

        counts.parse_nreads("#{names.get('mapped_uniq')}", 'n_mapped_uni')
        genomic.max_mismatches.downto(0) do |i|
          counts.parse_nreads(
            "#{names.get('mapped_uni', i)}", "n_mapped_uni_#{i}err"
          )
        end

        FeatureCounts.compute(
          "#{names.get('mapped_uniq')}",
          "#{names.get('ft')}",
          options[:genomic_annotation]
        )
        FeatureCounts.compute_distri(
          "#{names.get('ft')}",
          "#{names.get('ftdist')}",
          "#{names.get('ftsums')}",
          options[:genomic_annotation]
        )
        unless options[:igv_ref].nil?
          Metrics::IGV.compute(
            names.get('mapped_merged'),
            names.get('mapped_igvtdf'),
            names.get('mapped_igvwig'),
            options[:igv_ref]
          )
        end

        counts.parse_nreads("#{names.get('ftlog')}", 'n_assigned', 'Assigned')
        counts.parse_nreads(
          "#{names.get('ftsums')}", 'n_assigned_pc', 'protein_coding'
        )

        # get #reads mapping to CDS or UTR
        FeatureCounts.compute(
          "#{names.get('mapped_uniq')}",
          "#{names.get('ftcds')}",
          options[:genomic_annotation],
          'CDS'
        )
        FeatureCounts.compute(
          "#{names.get('mapped_uniq')}",
          "#{names.get('ftutr')}",
          options[:genomic_annotation],
          'UTR'
        )
        counts.parse_nreads("#{names.get('ftcdslog')}", 'n_cds', 'Assigned')
        counts.parse_nreads("#{names.get('ftutrlog')}", 'n_utr', 'Assigned')
      ensure
        ##
        # WRITE COUNTS
        tsv = Ribopip::ArrayWriter::TSVWriter.new(counts.array, true)
        tsv.write("#{names.get('counts')}.tsv")
        xml = Ribopip::ArrayWriter::XMLWriter.new(counts.array, true)
        xml.write("#{names.get('counts')}.xml")
      end
    end

    desc 'postproc', 'runs downstream analysis pipeline'
    method_option :annotation,
      :aliases => '-a',
      :banner => 'GTF',
      :desc => 'Path to genomic annotation',
      :required => true,
      :type => :string
    method_option :outdir,
      :aliases => '-o',
      :desc => 'Output directory',
      :required => true,
      :type => :string
    method_option :padj_threshold,
      :aliases => '-p',
      :banner => 'INT',
      :desc => 'Significance threshold for the adjusted p-value',
      :default => 0.05,
      :type => :numeric
    method_option :min_coverage,
      :aliases => '-m',
      :banner => 'INT',
      :desc => 'Minimum number of assignments to a gene across investigated replicates',
      :default => 128,
      :type => :numeric
    method_option :fp_1,
      :banner => 'NAME REP1 REP2 [REP3]',
      :desc => 'Treatment1 Replicates (e.g. mutant)',
      :type => :array
    method_option :fp_2,
      :banner => 'NAME REP1 REP2 [REP3]',
      :desc => 'Treatment2 Replicates (e.g. mutant)',
      :type => :array
    method_option :fp_control,
      :banner => 'NAME REP1 REP2 [REP3]',
      :desc => 'Control Replicates',
      :type => :array
    method_option :mrna_1,
      :banner => 'NAME REP1 REP2 [REP3]',
      :desc => 'Treatment1 Replicates (e.g. mutant)',
      :type => :array
    method_option :mrna_2,
      :banner => 'NAME REP1 REP2 [REP3]',
      :desc => 'Treatment2 Replicates (e.g. mutant)',
      :type => :array
    method_option :mrna_control,
      :banner => 'NAME REP1 REP2 [REP3]',
      :desc => 'Control Replicates',
      :type => :array
    method_option :ribodiff,
      :default => false,
      :desc => 'Run RiboDiff Analysis',
      :type => :boolean
    # runs downstream analysis pipeline
    def postproc
      require 'ribopip/deseq'
      require 'ribopip/expression_analysis'
      require 'ribopip/filenames'
      require 'ribopip/gene_id_2_name'

      ##
      # DESEQ
      fp_experiments =
        [options[:fp_1], options[:fp_2], options[:fp_control]].compact
      mrna_experiments  =
        [options[:mrna_1], options[:mrna_2], options[:mrna_control]].compact
      deseq_results = []

      # Run DESeq for each condition against each other condition
      [fp_experiments, mrna_experiments].each do |experiments|
        experiments.length.times do |i|
          (i+1).upto(experiments.length-1) do |j|
            condition1 = experiments[i]
            condition2 = experiments[j]
            names = Ribopip::Filenames::Deseq.new(
              condition1[0], condition2[0], options[:outdir]
            )
            deseq = Ribopip::ExpressionAnalysis::Deseq::Deseq.new(
              names.dup,
              [condition1[1..-1], condition2[1..-1]],
              [condition1[0],condition2[0]],
              options[:outdir]
            )
            deseq.compute
            deseq_results.push(deseq)
          end
        end
      end

      id2name = GeneID2Name.new(options[:annotation])
      deseq_results.each do |deseq|
        counts = Ribopip::ExpressionAnalysis::Deseq::AddCountsToResults.new(
          deseq.names.get('merged')
        )
        # add counts to unfiltered results
        counts.compute(
          1, deseq.names.get('all', 1), deseq.names.get('allft', 1)
        )
        counts.compute(
          0, deseq.names.get('all', 2), deseq.names.get('allft', 2)
        )
        # add counts to significant results (p-value)
        counts.compute(
          1, deseq.names.get('padj', 1), deseq.names.get('padjft', 1)
        )
        counts.compute(
          0, deseq.names.get('padj', 2), deseq.names.get('padjft', 2)
        )

        counts_norm = Ribopip::ExpressionAnalysis::Deseq::AddCountsToResults.new(
          deseq.names.get('merged'), '_norm', deseq.names.get('log', 1)
        )
        counts_norm.compute(
          1, deseq.names.get('allft', 1), deseq.names.get('allftnorm', 1)
        )
        counts_norm.compute(
          0, deseq.names.get('allft', 2), deseq.names.get('allftnorm', 2)
        )
        counts_norm.compute(
          1, deseq.names.get('padjft', 1), deseq.names.get('padjftnorm', 1)
        )
        counts_norm.compute(
          0, deseq.names.get('padjft', 2), deseq.names.get('padjftnorm', 2)
        )
        counts_norm.write(deseq.names.get('mergednorm', 1))

        # add gene names to gene ids
        id2name.translate(
          1,
          "#{deseq.names.get('padjftnorm', 1)}",
          "#{deseq.names.get('padjnames', 1)}"
        )
        id2name.translate(
          0,
          "#{deseq.names.get('padjftnorm', 2)}",
          "#{deseq.names.get('padjnames', 2)}"
        )
      end

      ##
      # POSTDESEQ
      # Parse significant genes from each experiment of each condition
      fp_experiments.length.times do | i |
        break if deseq_results[fp_experiments.length + i].nil?
        fp_base = deseq_results[i].names.base
        mrna_base = deseq_results[fp_experiments.length + i].names.base
        basenames = [
          Ribopip::Filenames::Postdeseq.new(fp_base),
          Ribopip::Filenames::Postdeseq.new(mrna_base)
        ]

        # copy significant results
        basenames.each do |names|
          run_cmd("cp #{names.get('padjftnorm', 1)} #{names.get('padjcp', 1)}")
          run_cmd("cp #{names.get('padjftnorm', 2)} #{names.get('padjcp', 2)}")
        end

        # add non-significant genes to either FP and total, if they were
        # significant in the other experiment
        important = Ribopip::ExpressionAnalysis::Deseq::ImportantGenes.new(
          basenames
        )
        important.compute

        # add gene names to gene ids
        id2name = GeneID2Name.new(options[:annotation])
        basenames.each do |names|
          id2name.translate(
            1, "#{names.get('padjcp', 1)}", "#{names.get('padjnames', 1)}"
          )
          Ribopip::ExpressionAnalysis::Deseq::Deseq2Excel.convert(
            "#{names.get('padjnames', 1)}", "#{names.get('padjxml', 1)}"
          )
          id2name.translate(
            0, "#{names.get('padjcp', 2)}", "#{names.get('padjnames', 2)}"
          )
          Ribopip::ExpressionAnalysis::Deseq::Deseq2Excel.convert(
            "#{names.get('padjnames', 2)}", "#{names.get('padjxml', 2)}"
          )
        end
      end

      ##
      # Detecting TE changes
      fp_experiments.length.times do |i|
        break if deseq_results[fp_experiments.length + i].nil?
        fp = deseq_results[i]
        mrna = deseq_results[fp_experiments.length + i]
        names_te = Ribopip::Filenames::TE.new(fp.names.base)
        Ribopip::ExpressionAnalysis::Deseq::TE.compute(
          fp.names.get('mergednorm'),
          mrna.names.get('mergednorm'),
          names_te.get('counts')
        )
        deseq_path = File.expand_path(
            '../../../scripts/run_deseq.R', __FILE__
        )

        cmd = "#{deseq_path} 1 #{names_te.get('counts')} 128" \
              " #{fp.conditions[0]} #{fp.conditions[1]} #{fp.reps_num.join(' ')} "\
              " #{names_te.get('all')} #{names_te.get('padj')} " \
              " > #{names_te.get('log')}"
        p "RUN TE: #{cmd}"
        `#{cmd}`

        counts = Ribopip::ExpressionAnalysis::Deseq::AddCountsToResults.new(
          names_te.get('counts'), '_TE'
        )
        counts.compute(1, names_te.get('padj'), names_te.get('padjft'))

        countsfp = Ribopip::ExpressionAnalysis::Deseq::AddCountsToResults.new(
          fp.names.get('merged'), '_fp'
        )
        countsfp.compute(1, names_te.get('padjft'), names_te.get('padjftfp'))

        countsmrna = Ribopip::ExpressionAnalysis::Deseq::AddCountsToResults.new(
          mrna.names.get('merged'), '_mrna'
        )
        countsmrna.compute(
          1, names_te.get('padjftfp'), names_te.get('padjftmrna')
        )

        id2name = GeneID2Name.new(options[:annotation])
        # add gene names to gene ids
        id2name.translate(
          1,
          names_te.get('padjftmrna'),
          names_te.get('padjnames')
        )

        Ribopip::ExpressionAnalysis::Deseq::Deseq2Excel.convert(
          names_te.get('padjnames'), names_te.get('xml')
        )
      end

      ##
      # RIBODIFF and DEseq with multiple factors
      # Run RiboDiff for each condition against each other condition
      fp_experiments.length.times do | i |
        (i+1).upto(fp_experiments.length-1) do |j|
          fp_cond1 = fp_experiments[i]
          fp_cond2 = fp_experiments[j]
          total_cond1 = mrna_experiments[i]
          total_cond2 = mrna_experiments[j]

          # RIBODIFF
          names_rdiff = Ribopip::Filenames::Ribodiff.new(
            fp_cond1[0].gsub(/fp_/, ''),
            fp_cond2[0].gsub(/fp_/, ''),
            options[:outdir]
          )
          rdiff = Ribopip::ExpressionAnalysis::Ribodiff.new(
            names_rdiff.dup,
            [fp_cond1[1..-1], fp_cond2[1..-1], total_cond1[1..-1], total_cond2[1..-1]],
            [fp_cond1[0], fp_cond2[0], total_cond1[0], total_cond2[0]],
            options[:outdir]
          )
          rdiff.compute_exp_outline
          rdiff.merge_featurecounts
          if options[:ribodiff]
            rdiff.compute
            counts = Ribopip::ExpressionAnalysis::Deseq::AddCountsToResults.new(
              names_rdiff.get('merged')
            )
            counts.compute(
              0, names_rdiff.get('padj'), names_rdiff.get('padjft')
            )
            id2name = GeneID2Name.new(options[:annotation])
            # add gene names to gene ids
            id2name.translate(
              0,
              names_rdiff.get('padjft'),
              names_rdiff.get('padjnames')
            )

            Ribopip::ExpressionAnalysis::Deseq::Deseq2Excel.convert(
              names_rdiff.get('padjnames'), names_rdiff.get('xml')
            )
          end

          # DESEQ WITH MULTI-FACTORS
          names_de = Ribopip::Filenames::Deseq.new(
            fp_cond1[0].gsub(/fp_/, ''),
            fp_cond2[0].gsub(/fp_/, ''),
            options[:outdir]
          )
          deseq_path = File.expand_path(
              '../../../scripts/run_deseq_rp.R', __FILE__
          )

          cmd = "#{deseq_path} #{names_rdiff.get('merged')}" \
                " #{names_rdiff.get('outline')} 128"\
                " #{names_de.get('all', '_mult')} "\
                " #{names_de.get('padj', '_mult')} " \
                " > #{names_de.get('log', '_mult')}"
          p "RUN DEseq RP: #{cmd}"
          `#{cmd}`

          counts = Ribopip::ExpressionAnalysis::Deseq::AddCountsToResults.new(
            names_rdiff.get('merged', '_mult')
          )
          counts.compute(
            0, names_de.get('padj', '_mult'), names_de.get('padjft', '_mult')
          )

          id2name = GeneID2Name.new(options[:annotation])
          # add gene names to gene ids
          id2name.translate(
            0,
            names_de.get('padjft', '_mult'),
            names_de.get('padjnames', '_mult')
          )

          Ribopip::ExpressionAnalysis::Deseq::Deseq2Excel.convert(
            names_de.get('padjnames', '_mult'), names_de.get('xml', '_mult')
          )
        end
      end

    end
  end
end