greenelab/adage-server

View on GitHub
adage/analyze/management/commands/import_activity.py

Summary

Maintainability
C
1 day
Test Coverage
#!/usr/bin/env python

"""
Load activity spreadsheet (generated by Jie) into the database.
This module should be invoked as a management command:

  python manage.py import_activity <activity_filename> <ml_model_name>

The two required arguments of this command are:
  (1) activity_filename: a tab-delimited activity spreadsheet;
  (2) ml_model_name: machine learning model's name that corresponds to
      activity_filename;

IMPORTANT:
Before running this command, please make sure that ml_model_name already
exists in the database.  If it doesn't, you can use the management
command "add_ml_model.py" to add it into the database.
"""

from __future__ import print_function
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from analyze.models import Sample, MLModel, Signature, Activity

import logging
# Module-level logger named after this module.  A NullHandler is attached
# so that the logger always has at least one handler, even when the
# project does not configure logging.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


class Command(BaseCommand):
    """
    Management command that loads an activity spreadsheet into the database.

    Usage:
      python manage.py import_activity <activity_filename> <ml_model_name>
    """
    help = ("Import activity data from an input  spreadsheet.")

    def add_arguments(self, parser):
        """
        Register the two required positional arguments on "parser":
          * activity_file: path of the tab-delimited activity spreadsheet,
            which is opened for reading by the argument parser;
          * ml_model_name: name of the machine learning model.
        """
        # Imported here so that this class is self-contained.
        import argparse

        # argparse.FileType('r') opens the named file for reading, like the
        # Python 2-only builtin "file" did, but it also exists on Python 3
        # (where "file" is gone and would raise a NameError).
        parser.add_argument('activity_file', type=argparse.FileType('r'))
        parser.add_argument('ml_model_name', type=str)

    def handle(self, **options):
        """
        Run the import and report the outcome.

        Any exception raised while importing is re-raised as a
        CommandError.  The input file is closed in all cases.
        """
        activity_file = options['activity_file']
        try:
            import_activity(activity_file, options['ml_model_name'])
            self.stdout.write(self.style.NOTICE(
                "Imported activity data successfully"))
        except Exception as e:
            raise CommandError(
                "Failed to import activity data: import_activity raised "
                "an exception:\n%s" % e)
        finally:
            # The parser opened this file on our behalf; release it here.
            activity_file.close()


def import_activity(file_handle, ml_model_name):
    """
    Populate the "Signature" and "Activity" tables from an activity sheet.

    The machine learning model named ml_model_name is looked up first; an
    exception is raised if it is not in the database.  The first line of
    file_handle is then passed to import_signatures() (without its first
    column), and every following line is passed to import_activity_line().
    """

    # The model must already be in the database; stop right away otherwise.
    try:
        mlmodel = MLModel.objects.get(title=ml_model_name)
    except MLModel.DoesNotExist:
        raise Exception("Input ml_model_name %s does not exist in the database"
                        % ml_model_name)

    # Everything below runs in a single transaction, so an exception
    # raised by any line rolls back the whole import.
    with transaction.atomic():
        header_names = []
        # Lines are numbered from 1 so that the number can be used
        # directly in error messages.
        for line_num, raw_line in enumerate(file_handle, 1):
            fields = raw_line.rstrip('\r\n').split('\t')
            if line_num > 1:
                import_activity_line(line_num, header_names, fields, mlmodel)
                continue
            # Header line: column #1 is skipped, the rest are signatures.
            header_names = fields[1:]
            import_signatures(header_names, mlmodel)


def import_signatures(signatures, mlmodel):
    """
    Load input signatures into "Signature" table in the database.

    "signatures" holds the signature names found on line #1 of the input
    file, starting from column #2; "mlmodel" is the machine learning model
    that each created Signature record is linked to.

    This function will raise an exception if any of the following errors
    are detected:
      * Signature name is blank (null or consists of space characters only);
      * Signature name is duplicate;
      * The combination of Signature name and given ml_model_name is not
        unique.
    """
    seen_names = set()
    # Signature names start in column #2 of the input file, so count from 2
    # to get the column number used in error messages.
    for col_num, name in enumerate(signatures, 2):
        if not name or name.isspace():
            # The column number is computed before formatting.  Writing
            # '"..." % index + 2' would format first and then try to add
            # 2 to the resulting string, which raises a TypeError.
            raise Exception(
                "Input file line #1 column #%d: blank signature name" %
                col_num)
        elif name in seen_names:
            raise Exception("Input file line #1 column #%d: %s is NOT unique" %
                            (col_num, name))
        elif Signature.objects.filter(name=name, mlmodel=mlmodel).exists():
            raise Exception("Input file line #1 column #%d: Signature name %s "
                            "already exists in Signature table"
                            % (col_num, name))
        else:
            seen_names.add(name)
            Signature.objects.create(name=name, mlmodel=mlmodel)


def import_activity_line(line_num, signatures, tokens, mlmodel):
    """
    Load numerical values in input tokens into "Activity" table.

    "line_num" is the line number used in error and warning messages,
    "signatures" is the list of signature names from the header line,
    "tokens" is the list of fields on the current data line, and
    "mlmodel" is the machine learning model that the signatures belong to.

    If the data source in column #1 is not found in "Sample" table, a
    warning is logged and the line is skipped without raising.

    This function will raise an exception if any of the following errors
    are detected on the data line:
      * The number of columns on this line is not equal to the number of
        signatures plus 1.
      * The data source field (in column #1) is blank;
      * Any field from column #2 to the end can not be converted into a
        float type.
    """
    if len(tokens) != len(signatures) + 1:
        raise Exception("Input file line #%d: Number of columns is not %d" %
                        (line_num, len(signatures) + 1))

    data_source = tokens[0]
    if not data_source or data_source.isspace():
        raise Exception("Input file line #%d: column #1 (data_source) is blank"
                        % line_num)

    try:
        sample = Sample.objects.get(ml_data_source=data_source)
    except Sample.DoesNotExist:
        # If data_source on the line is not found in Sample table, then
        # instead of raising an exception, generate a warning message
        # and skip this activity data line.
        # (logger.warning is used because logger.warn is a deprecated
        # alias.)
        logger.warning(
            "Input file line #%d: data_source in column #1 is not found in "
            "the database: %s", line_num, data_source)
        return

    values = tokens[1:]
    # In order to speed up the import, all activity records on the same
    # line will be saved in "records" and created in bulk at the end.
    records = []
    # The numerical values start from column #2, so count columns from 2.
    for col_num, (signature_name, value) in enumerate(
            zip(signatures, values), 2):
        try:
            float_val = float(value)
        except ValueError:
            raise Exception("Input file line #%d column #%d: %s can not be "
                            "converted into a float type" %
                            (line_num, col_num, value))

        signature = Signature.objects.get(name=signature_name, mlmodel=mlmodel)
        records.append(
            Activity(sample=sample, signature=signature, value=float_val)
        )
    Activity.objects.bulk_create(records)  # Create records in bulk.