greenelab/adage-server

View on GitHub
adage/analyze/management/commands/import_gene_sample_expr.py

Summary

Maintainability
C
1 day
Test Coverage
#!/usr/bin/env python

"""
This management command reads an input file of gene-sample expression values
and loads the valid data into the database.  It should be invoked like this:

  python manage.py import_gene_sample_expr <expression_filename> \
<organism_tax_id>

The two required arguments are:
  (1) expression_filename: input file of gene-sample expression values;
  (2) organism_tax_id: taxonomy ID of the organism.

For example, to load the sample-gene expression values for the organism
"Pseudomonas aeruginosa" (whose taxonomy ID is 208964), the command will be:
  python manage.py import_gene_sample_expr input_filename 208964

IMPORTANT:
(1) Before running this command, please make sure that the "django-organisms"
package has been installed and that organism_tax_id already exists in the
database. If organism_tax_id is not in the database yet, please use the
management command "organisms_create_or_update.py" (in the "django-organisms"
package) to add it.
(2) If a data source (on the first row of input file) or gene name (on the
first column of input file) is not found in the database, a warning message
will be generated and the corresponding column or row will be skipped.
"""


from __future__ import print_function
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from organisms.models import Organism
from genes.models import Gene
from analyze.models import Sample, ExpressionValue

import logging
# Module-level logger.  A NullHandler is attached so that warnings emitted
# by this module are silently dropped unless the project configures its own
# logging handlers.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


class Command(BaseCommand):
    """
    Management command that imports gene-sample expression values from a
    tab-delimited input file into the database.
    """

    help = ("Import gene-sample expression values from an input file.")

    def add_arguments(self, parser):
        """
        Register the two required positional arguments:
          * expression_filename: path of the input file (opened for reading
            by the argument parser);
          * organism_tax_id: integer taxonomy ID of the organism.
        """
        # Imported here so that this class does not rely on a module-level
        # "import argparse".
        import argparse

        # The "file" builtin exists only in Python 2.  argparse.FileType
        # opens the path on both Python 2 and 3, and reports an unreadable
        # path as a regular argument error instead of a traceback.
        parser.add_argument('expression_filename',
                            type=argparse.FileType('r'))
        parser.add_argument('organism_tax_id', type=int)

    def handle(self, **options):
        """
        Run the import.  Any exception raised while importing is converted
        into a CommandError, so that it is reported as a command failure.
        """
        try:
            import_expr(options['expression_filename'],
                        options['organism_tax_id'])
            self.stdout.write(self.style.NOTICE(
                "Imported gene-sample expression values successfully"))
        except Exception as e:
            raise CommandError(
                "Raised exception when importing gene-sample expression "
                "values: %s" % e)


def import_expr(file_handle, organism_tax_id):
    """
    Read a tab-delimited input file and load its gene-sample expression
    values into the database.

    Arguments:
      * file_handle: iterable of text lines (typically an open file).  The
        first line is the header, whose columns from #2 onward are sample
        data sources; each following line is a gene name followed by one
        expression value per sample.
      * organism_tax_id: taxonomy ID of the organism that the genes belong
        to.

    An exception will be raised if organism_tax_id is not found in the
    database, or if the header line or any data line is rejected.  In the
    latter case nothing is saved, because the whole import runs in a single
    transaction.
    """

    # Make sure input organism_tax_id already exists in database.
    try:
        organism = Organism.objects.get(taxonomy_id=organism_tax_id)
    except Organism.DoesNotExist:
        # The command name matches the one given in this module's docstring.
        raise Exception("Input organism_tax_id is not found in the database. "
                        "Please use the management command "
                        "'organisms_create_or_update.py' in django-organisms "
                        "package to create this organism.")

    # Enclose reading/importing process in a transaction context manager.
    # Any exception raised inside the manager will terminate the transaction
    # and roll back the database.
    with transaction.atomic():
        samples = []
        for line_index, line in enumerate(file_handle):
            tokens = line.rstrip('\r\n').split('\t')
            if line_index == 0:
                # Column #1 of the header is the label of the gene column,
                # so only the remaining tokens are sample data sources.
                read_header(tokens[1:], samples)
            else:
                # Line numbers in error messages are 1-based.
                import_data_line(line_index + 1, tokens, samples, organism)


def read_header(tokens, samples):
    """
    Read input tokens on header line and save the corresponding sample
    object into "samples". (Each token will be searched in the database
    using "ml_data_source" field. If a token does not match any sample's
    ml_data_source, a warning is logged and None is put into "samples", so
    that "samples" always has exactly one entry per token.)

    Arguments:
      * tokens: data source names on the header line, starting with the one
        in column #2 of the input file;
      * samples: list to which the results are appended in place.

    An exception will be raised if any of the following errors are detected:
      * Sample token is blank (null or consists of space characters only);
      * Sample token is duplicate;
    """

    token_set = set()
    # Column #1 of the input file is the gene column, so the first token
    # here is column #2.
    for col_num, data_source in enumerate(tokens, 2):
        if not data_source or data_source.isspace():
            # The column number is computed before formatting: "%" binds
            # tighter than "+", so formatting with "% index + 2" would try
            # to add an integer to the formatted string.
            raise Exception("Input file line #1 column #%d: blank data_source"
                            % col_num)
        if data_source in token_set:
            raise Exception("Input file line #1 column #%d: %s is duplicate" %
                            (col_num, data_source))
        token_set.add(data_source)

        try:
            sample = Sample.objects.get(ml_data_source=data_source)
        except Sample.DoesNotExist:
            # Keep a placeholder so that column positions stay aligned.
            sample = None
            logger.warning(
                "Input file line #1: data_source in column #%d not found "
                "in the database: %s", col_num, data_source)
        samples.append(sample)


def import_data_line(line_num, tokens, samples, organism):
    """
    Import the expression values found on one data line into the database.

    Arguments:
      * line_num: 1-based number of this line, used in messages only;
      * tokens: fields of the line; column #1 is the gene's systematic name
        and the remaining columns are expression values;
      * samples: one entry per value column, either a sample object or None
        for a column that must be skipped;
      * organism: organism that the gene is looked up in.

    If the gene is not found in the database, a warning is logged and the
    line is skipped.  An exception will be raised when:
      * the number of columns differs from the number of samples plus 1;
      * the gene name (column #1) is blank;
      * the gene name matches more than one gene in the database;
      * a value (column #2 to the end) can not be converted into a float.
    """

    expected_columns = len(samples) + 1
    if len(tokens) != expected_columns:
        raise Exception("Input file line #%d: Number of columns is not %d" %
                        (line_num, expected_columns))

    gene_name, raw_values = tokens[0], tokens[1:]
    if not (gene_name and not gene_name.isspace()):
        raise Exception("Input file line #%d: gene name (column #1)"
                        " is blank" % line_num)

    try:
        gene = Gene.objects.get(systematic_name=gene_name, organism=organism)
    except Gene.MultipleObjectsReturned:
        raise Exception("Input file line #%d: gene name %s (column #1) matches"
                        " multiple genes in the database" %
                        (line_num, gene_name))
    except Gene.DoesNotExist:
        # An unknown gene is not an error: warn and ignore the whole line.
        logger.warning(
            "Input file line #%d: gene name %s (column #1) not found in "
            "database", line_num, gene_name)
        return

    # Collect the values of this line first and insert them with a single
    # bulk query, which is faster than saving them one by one.
    pending = []
    # Expression values start from column #2.
    for column, (sample, text) in enumerate(zip(samples, raw_values), 2):
        try:
            number = float(text)
        except ValueError:
            raise Exception("Input file line #%d column #%d: expression value "
                            "%s not numeric" % (line_num, column, text))
        # Every value is validated, but columns without a sample in the
        # database are not stored.
        if sample is None:
            continue
        pending.append(
            ExpressionValue(sample=sample, gene=gene, value=number))
    ExpressionValue.objects.bulk_create(pending)