kg_covid_19/transform_utils/drug_central/drug_central.py from justaddcoffee/kg-emerging-viruses

kg_covid_19/transform_utils/drug_central/drug_central.py
Summary

Maintainability

2 hrs
Test Coverage

87%
Issues
"""Transform class for DrugCentral drug vs target interactions."""

import gzip
import logging
import os
from collections import defaultdict
from typing import Dict, List, Optional

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import (ItemInDictNotFoundError,
                                               data_to_dict,
                                               get_item_by_priority,
                                               parse_header,
                                               write_node_edge_item)

"""
Ingest drug - drug target interactions from Drug Central.

Essentially just ingests and transforms this file:
https://unmtid-shinyapps.net/download/DrugCentral/2021_09_01/drug.target.interaction.tsv.gz

And extracts Drug -> Protein interactions.
"""


class DrugCentralTransform(Transform):
    """Transform DrugCentral interaction data."""

    def __init__(
        self, input_dir: Optional[str] = None, output_dir: Optional[str] = None
    ) -> None:
        """Initialize."""
        source_name = "drug_central"
        super().__init__(source_name, input_dir, output_dir)  # set some variables
        self.node_header = ["id", "name", "category", "TDL", "provided_by"]

    def run(
        self, data_file: Optional[str] = None, species: str = "Homo sapiens"
    ) -> None:
        """
        Call method and perform transformations.

        Process the Drug Central data, additional information
        on this data can be found in the comment at the top of this script.
        """
        if data_file is None:
            data_file = "drug.target.interaction.tsv.gz"
        interactions_file = os.path.join(self.input_base_dir, data_file)
        os.makedirs(self.output_dir, exist_ok=True)
        drug_node_type = "biolink:Drug"
        uniprot_curie_prefix = "UniProtKB:"
        drug_curie_prefix = "DrugCentral:"
        protein_node_type = "biolink:Protein"
        drug_protein_edge_label = "biolink:molecularly_interacts_with"
        drug_protein_edge_relation = "RO:0002436"  # molecularly interacts with
        self.edge_header = [
            "subject",
            "predicate",
            "object",
            "relation",
            "provided_by",
            "comment",
            "type",
        ]

        with open(self.output_node_file, "w") as node, open(
            self.output_edge_file, "w"
        ) as edge, gzip.open(interactions_file, "rt") as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())

            seen_proteins: dict = defaultdict(int)
            seen_drugs: dict = defaultdict(int)

            for line in interactions:
                items_dict = parse_drug_central_line(line, header_items)

                if "ORGANISM" not in items_dict or items_dict["ORGANISM"] != species:
                    continue

                # get protein ID
                try:
                    protein_dict = items_dict_to_protein_data_dict(items_dict)

                except ItemInDictNotFoundError:
                    # lines with no ACCESSION entry only contain drug info, no target
                    # info - not ingesting these
                    continue
                except ValueError:
                    logging.error("Value error while parsing line")
                    continue

                # get drug ID
                drug_id = drug_curie_prefix + get_item_by_priority(
                    items_dict, ["STRUCT_ID"]
                )

                # Write drug node
                if drug_id not in seen_drugs:
                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[
                            drug_id,
                            items_dict["DRUG_NAME"],
                            drug_node_type,
                            "",  # TDL (not applicable for drugs)
                            self.source_name,
                        ],
                    )
                    seen_drugs[drug_id] += 1

                for _, (uniprot_id, name, tdl) in protein_dict.items():
                    protein_id = uniprot_curie_prefix + uniprot_id

                    if protein_id not in seen_proteins:
                        write_node_edge_item(
                            fh=node,
                            header=self.node_header,
                            data=[
                                protein_id,
                                name,
                                protein_node_type,
                                tdl,
                                self.source_name,
                            ],
                        )
                        seen_proteins[protein_id] += 1

                    # WRITE EDGES
                    write_node_edge_item(
                        fh=edge,
                        header=self.edge_header,
                        data=[
                            drug_id,
                            drug_protein_edge_label,
                            protein_id,
                            drug_protein_edge_relation,
                            self.source_name,
                            items_dict["ACT_COMMENT"],
                            "biolink:Association",
                        ],
                    )

        return None


def parse_drug_central_line(this_line: str, header_items: List) -> Dict:
    """Process a line of text from Drug Central.

    Args:
        this_line: A string containing a line of text.
        header_items: A list of header items.
    Returns:
        item_dict: A dictionary of header items and a processed Drug Central string.
    """
    data = this_line.strip().split("\t")
    data = [i.replace('"', "") for i in data]
    item_dict = data_to_dict(header_items, data)

    return item_dict


def items_dict_to_protein_data_dict(items_dict: dict) -> dict:
    """
    Convert an items_dict to a dictionary of protein data.

    Given a parsed line from parse_drug_central_line, split up pipe-separated entries
    for several related proteins and their names and TDL info into separate protein
    entries.
    :param items_dict: dictionary of data from a line, output by parse_drug_central_line
    :return: a dict with information about each protein
    """
    protein_ids_string = get_item_by_priority(items_dict, ["ACCESSION"])
    protein_ids = protein_ids_string.split("|")
    gene_name = get_item_by_priority(items_dict, ["GENE"]).split("|")
    tdl_values = get_item_by_priority(items_dict, ["TDL"]).split("|")

    if len(protein_ids) != len(gene_name):
        logging.warning(
            "Didn't get the same number of entries for protein_ids and gene_ids"
        )
        gene_name = [""] * len(protein_ids)

    if len(protein_ids) != len(tdl_values):
        # this happens - repeat TDL designation for all protein IDs
        tdl_values = tdl_values * len(protein_ids)

    protein_dict = defaultdict(list)
    for i in range(len(protein_ids)):
        protein_dict[protein_ids[i]] = [protein_ids[i], gene_name[i], tdl_values[i]]
    return protein_dict