bel/nanopub/belscripts.py from belbio/bel

bel/nanopub/belscripts.py
Summary

Maintainability

1 day
Test Coverage

Issues
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-

"""
Process belscripts content into nanopubs_bel-1.0.0 format for use in BEL.bio

BEL Script format documented here:

    https://wiki.openbel.org/display/BFD/BEL+Script+V2.0+Format

Notes:
    * Does not check that the document and definition sections are at the top of the document
"""

# Standard Library
import collections
import copy
import csv
import gzip
import json
import re
import sys

# Third Party
import yaml
from loguru import logger

# Citation fields, in order: document type, document name, document reference ID, and optionally a publication date, an authors list and a comment field

# Nanopub "type" entry; yield_nanopub() copies it into every nanopub it builds
nanopub_type = {"name": "BEL", "version": "2.0.0"}


def convert_csv_str_to_list(csv_str: str) -> list:
    """Convert a BEL Script list string into a list of values

    Args:
        csv_str: comma-separated values, optionally wrapped in curly
            braces, e.g. ``{"PubMed", "Some Journal", "12345"}``

    Returns:
        list: the individual values with surrounding whitespace and
            double quotes removed; an empty list for an empty string
    """

    # Drop the optional enclosing braces
    csv_str = re.sub(r"^\s*{", "", csv_str)
    csv_str = re.sub(r"}\s*$", "", csv_str)

    # skipinitialspace is needed so that a quoted value following ", " is
    # still recognised as quoted and keeps any commas it contains
    row = next(csv.reader([csv_str], skipinitialspace=True), [])

    new = []
    for col in row:
        # Clean up stray whitespace/quotes the csv reader left in place
        col = re.sub(r'^\s*"?\s*', "", col)
        col = re.sub(r'\s*"?\s*$', "", col)
        new.append(col)

    return new


def process_citation(citation_str: str) -> dict:
    """Parse BEL Script Citation string into nanopub_bel citation object

    Args:
        citation_str: citation list string, e.g.
            ``{"PubMed","Some Journal","9990071","","",""}``; fields are,
            in order, type, name, document id, publication date,
            pipe-separated authors and comment (trailing fields optional)

    Returns:
        dict: citation object with any of the keys ``authors``, ``uri``,
            ``database``, ``reference``, ``date_published`` and ``comment``
    """

    citation_obj = {}

    citation_list = convert_csv_str_to_list(citation_str)
    # Pad with None so citations with fewer than six fields still unpack;
    # any fields beyond the sixth are ignored
    (citation_type, name, doc_id, pub_date, authors, comment, *_) = citation_list + [None] * 6

    if authors:
        citation_obj["authors"] = authors.split("|")

    # A name that is a web address (http or https) is stored as the uri
    if name and re.match(r"https?://", name):
        citation_obj["uri"] = name

    elif citation_type and citation_type.upper() == "PUBMED":
        citation_obj["database"] = {"name": "PubMed", "id": doc_id}
        if name:
            citation_obj["reference"] = name
    elif name:
        citation_obj["reference"] = name

    else:
        citation_obj["reference"] = "No reference found"

    if pub_date:
        citation_obj["date_published"] = pub_date

    if comment:
        citation_obj["comment"] = comment

    return citation_obj


def yield_metadata(nanopubs_metadata):
    """Wrap an independent copy of the collected metadata under a "metadata" key"""

    snapshot = copy.deepcopy(nanopubs_metadata)
    wrapped = dict(metadata=snapshot)
    return wrapped


def split_bel_stmt(stmt: str, line_num=None) -> tuple:
    """Split bel statement into subject, relation, object tuple

    Args:
        stmt: BEL statement, e.g. ``p(HGNC:A) increases p(HGNC:B)``
        line_num: line number reported in the log message when the
            statement cannot be split (optional)

    Returns:
        tuple: ``(subject, relation, object)``; when the statement cannot
            be split the whole statement is returned as the subject and
            relation and object are None
    """

    # subject: shortest prefix ending in ")"; relation: a run of letters
    # and = - > | : characters; object: the remainder of the statement
    m = re.match(r"^(.*?\))\s+([a-zA-Z=\->\|:]+)\s+([\w(]+.*?)$", stmt)
    if m:
        return (m.group(1), m.group(2), m.group(3))

    logger.info(
        f"Could not parse bel statement into components at line number: {line_num} assertion: {stmt}"
    )
    return (stmt, None, None)


def yield_nanopub(assertions, annotations, line_num):
    """Build a nanopub dict from BEL statements and the active annotations

    Returns an empty dict when there are no assertions, otherwise a dict
    with a single "nanopub" key. line_num is passed on to split_bel_stmt.
    """

    if not assertions:
        return {}

    # Work on a private copy since the special keys are popped off below
    remaining = copy.deepcopy(annotations)

    evidence = remaining.pop("evidence", None)
    stmt_group = remaining.pop("statement_group", None)
    citation = remaining.pop("citation", None)

    # Multi-valued annotations become one entry per value
    anno_list = []
    for anno_type, value in remaining.items():
        labels = value if isinstance(value, (list, tuple)) else [value]
        anno_list.extend({"type": anno_type, "label": label} for label in labels)

    assertions_list = []
    for statement in assertions:
        subj, rel, obj = split_bel_stmt(statement, line_num)
        assertions_list.append({"subject": subj, "relation": rel, "object": obj})

    nanopub = {}
    nanopub["schema_uri"] = (
        "https://raw.githubusercontent.com/belbio/schemas/master/schemas/nanopub_bel-1.0.0.yaml"
    )
    nanopub["type"] = copy.deepcopy(nanopub_type)
    nanopub["annotations"] = copy.deepcopy(anno_list)
    nanopub["citation"] = copy.deepcopy(citation)
    nanopub["assertions"] = copy.deepcopy(assertions_list)
    nanopub["evidence"] = evidence
    nanopub["metadata"] = {"statement_group": stmt_group}

    return {"nanopub": copy.deepcopy(nanopub)}


def process_documentline(line, nanopubs_metadata):
    """Process SET DOCUMENT line in BEL script

    Args:
        line: a ``SET DOCUMENT <key> = "<value>"`` line
        nanopubs_metadata: metadata dict, updated in place

    Returns:
        dict: nanopubs_metadata with ``<key>`` set to ``<value>``; returned
            unchanged (after logging a warning) if the line cannot be parsed
    """

    # Whitespace around "=" is optional, as it is for other SET lines
    matches = re.match(r'SET DOCUMENT\s+(\w+)\s*=\s*"?(.*?)"?\s*$', line)
    if not matches:
        logger.warning(f"Problem with SET DOCUMENT line: {line}")
        return nanopubs_metadata

    key, val = matches.groups()
    nanopubs_metadata[key] = val

    return nanopubs_metadata


def process_definition(line, nanopubs_metadata):
    """Process DEFINE line in BEL script

    Handles ``DEFINE <type> <name> AS URL "<url>"`` and
    ``DEFINE <type> <name> AS LIST {<values>}``. NAMESPACE definitions are
    stored under "namespaces" and ANNOTATION definitions under
    "annotations"; any other type is stored under its lowercased name.

    Args:
        line: the DEFINE line
        nanopubs_metadata: metadata dict, updated in place

    Returns:
        dict: nanopubs_metadata; unchanged if the line matches neither form
    """

    plural_keys = {"namespace": "namespaces", "annotation": "annotations"}

    url_match = re.match(r'DEFINE\s+(\w+)\s+(\w+)\s+AS\s+URL\s+"(.*?)"\s*$', line)
    list_match = re.match(r"DEFINE\s+(\w+)\s+(\w+)\s+AS\s+LIST\s+{(.*?)}\s*$", line)

    if url_match:
        def_type, key, val = url_match.groups()
    elif list_match:
        def_type, key, raw_list = list_match.groups()
        val = convert_csv_str_to_list(raw_list)
    else:
        # Other DEFINE forms are not recorded
        return nanopubs_metadata

    def_type = def_type.lower()
    def_type = plural_keys.get(def_type, def_type)

    nanopubs_metadata.setdefault(def_type, {})[key] = val

    return nanopubs_metadata


def process_unset(line, annotations):
    """Process UNSET lines in BEL Script

    Args:
        line: an ``UNSET <name>``, ``UNSET {<name>, ...}``, ``UNSET ALL`` or
            ``UNSET STATEMENT_GROUP`` line
        annotations: currently active annotations

    Returns:
        dict: the remaining annotations - a new empty dict for ALL and
            STATEMENT_GROUP, otherwise the given dict with the named
            entries removed; unchanged (after logging a warning) if the
            line cannot be parsed
    """

    matches = re.match(r'UNSET\s+"?(.*?)"?\s*$', line)
    if not matches:
        # loguru only provides warning(); there is no warn() alias
        logger.warning(f"Problem with UNSET line: {line}")
        return annotations

    target = matches.group(1)
    if target in ("ALL", "STATEMENT_GROUP"):
        return {}

    if target.startswith("{"):
        # A braced list unsets several annotations at once
        for name in convert_csv_str_to_list(target):
            annotations.pop(name, None)
    else:
        annotations.pop(target, None)

    return annotations


def process_set(line, annotations):
    """Convert annotations into nanopub_bel annotations format

    Args:
        line: a ``SET <key> = <value>`` line
        annotations: currently active annotations, updated in place

    Returns:
        dict: annotations with the new value stored. STATEMENT_GROUP is
            stored as "statement_group", Citation as a parsed "citation"
            object, Support/Evidence (any case) as "evidence", a braced
            list as a list under its key, anything else as a string under
            its key. Unchanged (after logging a warning) if the line
            cannot be parsed.
    """

    matches = re.match(r'SET\s+(\w+)\s*=\s*"?(.*?)"?\s*$', line)
    if not matches:
        logger.warning(f"Problem with SET line: {line}")
        return annotations

    key, val = matches.groups()

    if key == "STATEMENT_GROUP":
        annotations["statement_group"] = val
    elif key == "Citation":
        annotations["citation"] = process_citation(val)
    elif key.lower() in ("support", "evidence"):
        annotations["evidence"] = val
    elif re.match(r"\s*{.*?}", val):
        annotations[key] = convert_csv_str_to_list(val)
    else:
        annotations[key] = val

    return annotations


def set_single_line(lines):
    """Join multi-line quoted SET statements into single lines

    A SET line that opens a double-quoted value without closing it is held
    and joined (space separated) with the following lines up to and
    including the first line containing a double quote. Runs of whitespace
    in every yielded line, including the trailing newline, are collapsed to
    a single space.

    Args:
        lines: iterable of BEL Script lines

    Yields:
        str: one whitespace-normalised line per statement
    """

    pending = None  # text of a SET statement whose quoted value is still open

    for line in lines:
        if pending is not None:
            if re.match(r'.*"', line):
                # First line containing a quote closes the open value
                yield re.sub(r"\s+", " ", pending + " " + line)
                pending = None
            else:
                pending += " " + line.rstrip()

        elif re.match(r'SET\s*\w+\s*=\s*".*"', line):
            # Quoted value opened and closed on the same line
            yield re.sub(r"\s+", " ", line)
        elif re.match(r'SET\s*\w+\s*=\s*".*', line):
            pending = line.rstrip()
        else:
            yield re.sub(r"\s+", " ", line)

    # Input ended inside an open quoted value: emit what was collected
    # rather than silently dropping it
    if pending is not None:
        yield re.sub(r"\s+", " ", pending)


def preprocess_belscript(lines):
    """Convert any multi-line SET statements into single line SET statements

    Lines following a SET line are appended to it (space separated) until
    the next SET line, a blank line or the end of the input.

    Args:
        lines: iterable of BEL Script lines

    Yields:
        str: each SET statement as a single newline-terminated line; all
            other lines unchanged
    """

    set_line = None  # parts of the SET statement being collected, if any

    for line in lines:
        starts_set = line.startswith("SET")

        if set_line is None:
            if starts_set:
                set_line = [line.rstrip()]
            else:
                yield line
        # SET following SET
        elif starts_set:
            yield f"{' '.join(set_line)}\n"
            set_line = [line.rstrip()]
        # Blank line following SET yields single line SET
        elif re.match(r"\s+$", line):
            yield f"{' '.join(set_line)}\n"
            yield line
            set_line = None
        # Append second, third, ... lines to SET
        else:
            set_line.append(line.rstrip())

    # Input ended while still collecting a SET statement
    if set_line is not None:
        yield f"{' '.join(set_line)}\n"


def _strip_trailing_comment(line):
    """Remove a trailing // comment that is not inside a double-quoted value"""

    in_quotes = False
    idx = 0
    while idx < len(line):
        char = line[idx]
        if in_quotes and char == "\\":
            # Skip the escaped character so \" does not close the value
            idx += 2
            continue
        if char == '"':
            in_quotes = not in_quotes
        elif not in_quotes and line.startswith("//", idx):
            return line[:idx]
        idx += 1

    return line


def parse_belscript(lines):
    """Lines from the BELScript - can be an iterator or list

    yields Nanopubs in nanopubs_bel-1.0.0 format

    Args:
        lines: BEL Script lines (list, open file or any other iterable)

    Yields:
        dict: ``{"metadata": {...}}`` once, built from the SET DOCUMENT and
            DEFINE lines, followed by one ``{"nanopub": {...}}`` per group
            of BEL statements sharing the same annotations
    """

    nanopubs_metadata = {}
    annotations = {}
    assertions = []

    # Continuation handling below pulls extra lines with next(), so make
    # sure we have an iterator shared with set_single_line()
    lines = iter(lines)

    # Counts lines produced by set_single_line(), used in log messages
    line_num = 0

    for line in set_single_line(lines):

        line_num += 1
        # Get rid of trailing comments; "//" inside quoted values (e.g. URLs) is kept
        line = _strip_trailing_comment(line).rstrip()

        # Collapse continuation lines (trailing backslash)
        while re.search(r"\\\s*$", line):
            continuation = next(lines, "")  # "" if the input ends here
            line = re.sub(r"\\\s*$", "", line) + _strip_trailing_comment(continuation).rstrip()

        # Process lines #################################
        if re.match(r"\s*#", line) or re.match(r"\s*$", line):
            # Skip comments and empty lines
            continue
        elif re.match("SET DOCUMENT", line):
            nanopubs_metadata = process_documentline(line, nanopubs_metadata)
        elif re.match("DEFINE", line):
            nanopubs_metadata = process_definition(line, nanopubs_metadata)
        elif re.match("UNSET", line):

            # Process any assertions prior to changing annotations
            if assertions:
                yield yield_nanopub(assertions, annotations, line_num)
                assertions = []
            annotations = process_unset(line, annotations)

        elif re.match("SET", line):
            # Create nanopubs metadata prior to starting BEL Script statements section
            if nanopubs_metadata:
                yield yield_metadata(nanopubs_metadata)
                nanopubs_metadata = {}

            # Process any assertions prior to changing annotations
            if assertions:
                yield yield_nanopub(assertions, annotations, line_num)
                assertions = []

            annotations = process_set(line, annotations)

        else:
            assertions.append(line)

    # Metadata not yet emitted because no annotation SET line was seen
    if nanopubs_metadata:
        yield yield_metadata(nanopubs_metadata)

    # Catch any leftover bel statements
    if assertions:
        yield yield_nanopub(assertions, annotations, line_num)


def main():
    """Parse the local ``test.v2.bel`` BEL Script and print each result as JSON"""

    with open("test.v2.bel", "r") as f:
        for doc in parse_belscript(f):
            print(json.dumps(doc, indent=4))


# Run main() only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()