bio-miga/miga

View on GitHub
scripts/ssu.bash

Summary

Maintainability
Test Coverage
#!/bin/bash
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES, $DATASET
set -e
SCRIPT="ssu"
# shellcheck source=scripts/miga.bash
. "$MIGA/scripts/miga.bash" || exit 1
DIR="$PROJECT/data/07.annotation/01.function/02.ssu"
[[ -d "$DIR" ]] || mkdir -p "$DIR"
cd "$DIR"

# Initialize
miga date > "$DATASET.start"

fa="../../../05.assembly/$DATASET.LargeContigs.fna"
if [[ -s $fa ]] ; then
  # Get domain
  d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
  if [[ "$d" != "Bacteria" && "$d" != "Archaea" && "$d" != "Eukaryota" ]] ; then
    d="Bacteria" # Assume Bacteria in the absence of additional information
  fi

  # Run barrnap
  dom_opt="$(echo "$d" | perl -ne 'print lc' | head -c 3)"
  barrnap --quiet --kingdom "$dom_opt" --threads "$CORES" "$fa" \
    > "${DATASET}.gff"

  # Extract
  grep "^##gff\\|;product=16S " < "${DATASET}.gff" \
    | bedtools getfasta -s "-fi" "$fa" -bed /dev/stdin \
      -fo "${DATASET}.ssu.all.fa"
  FastA.length.pl "${DATASET}.ssu.all.fa" | sort -nr -k 2 | head -n 1 \
    | cut -f 1 > "${DATASET}.ssu.fa.id"
  FastA.filter.pl "${DATASET}.ssu.fa.id" "${DATASET}.ssu.all.fa" \
    > "${DATASET}.ssu.fa"
  rm "${DATASET}.ssu.fa.id"
  [[ -e "${fa}.fai" ]] && rm "${fa}.fai"

  # RDP classifier
  if [[ "$MIGA_RDP" == "yes" && -s "${DATASET}.ssu.all.fa" ]] ; then
    java -jar "$MIGA_HOME/.miga_db/classifier.jar" classify \
      -c 0.8 -f fixrank -g 16srrna -o "${DATASET}.rdp.tsv" \
      "${DATASET}.ssu.all.fa"
    echo "# Version: $(perl -pe 's/.*://' \
          < "$MIGA_HOME/.miga_db/classifier.version.txt" \
          | grep . | paste - - | perl -pe 's/\t/; /')" \
      >> "${DATASET}.rdp.tsv"
  fi

  # tRNAscan-SE
  dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
  out="${DATASET}.trna.txt"
  # `echo O` is to avoid a hang from a pre-existing output file.
  # This is better than pre-checking (and removing), because it avoids
  # the (unlikely) scenario of a file racing (e.g., a file created right
  # before tRNAscan-SE starts, or a `rm` failure).
  #
  # The trailing `|| true` is to treat failure as non-fatal
  echo O | tRNAscan-SE $dom_opt -o "${DATASET}.trna.txt" -q "$fa" || true

  # Gzip
  for x in gff ssu.all.fa rdp.tsv trna.txt ; do
    [[ -e "${DATASET}.${x}" ]] && gzip -9 -f "${DATASET}.${x}"
  done
fi

# Finalize
miga date > "${DATASET}.done"
cat <<VERSIONS \
  | miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f --stdin-versions
=> MiGA
$(miga --version)
$(
  if [[ -s $fa ]] ; then
    echo "=> barrnap"
    barrnap --version 2>&1 | perl -pe 's/^barrnap //'
    echo "=> bedtools"
    bedtools --version 2>&1 | perl -pe 's/^bedtools //'
    echo "=> Enveomics Collection"
    echo "version unknown"
    echo "=> RDP Naive Bayes Classifier"
    gzip -cd "${DATASET}.rdp.tsv.gz" | tail -n 1 | perl -pe 's/.*: //'
    echo "=> tRNAscan-SE"
    tRNAscan-SE -h 2>&1 | head -n 2 | tail -n 1 | perl -pe 's/^tRNAscan-SE //'
  fi
)
VERSIONS