examples/modomics.py from KarrLab/bpforms

examples/modomics.py
Summary

Maintainability

6 days
Test Coverage

Issues
""" Generate BpForms for all of the tRNA in MODOMICS, verify
them, and calculate their properties

:Author: Jonathan Karr <karr@mssm.edu>
:Date: 2019-06-20
:Copyright: 2019, Karr Lab
:License: MIT
"""

from matplotlib import pyplot
import bpforms
import bs4
import csv
import matplotlib
import numpy
import os
import requests
import requests_cache

URL = 'https://iimcb.genesilico.pl/modomics/sequences/'


def run():
    # create dict of MODOMICS single character monomer codes
    modomics_short_code_to_monomer = {}
    for monomer in bpforms.rna_alphabet.monomers.values():
        for identifier in monomer.identifiers:
            if identifier.ns == 'modomics.short_name':
                modomics_short_code_to_monomer[identifier.id] = monomer
    modomics_short_code_to_monomer['a'] = bpforms.rna_alphabet.monomers.get('A')
    modomics_short_code_to_monomer['c'] = bpforms.rna_alphabet.monomers.get('C')
    modomics_short_code_to_monomer['g'] = bpforms.rna_alphabet.monomers.get('G')
    modomics_short_code_to_monomer['u'] = bpforms.rna_alphabet.monomers.get('U')
    modomics_short_code_to_monomer['b'] = bpforms.rna_alphabet.monomers.get('0522U')
    modomics_short_code_to_monomer['B'] = bpforms.rna_alphabet.monomers.get('0C')
    modomics_short_code_to_monomer['E'] = bpforms.rna_alphabet.monomers.get('662A')
    modomics_short_code_to_monomer['h'] = bpforms.rna_alphabet.monomers.get('21511U')
    # modomics_short_code_to_monomer['H'] = bpforms.rna_alphabet.monomers.get('0C')
    modomics_short_code_to_monomer['J'] = bpforms.rna_alphabet.monomers.get('0U')
    modomics_short_code_to_monomer['l'] = bpforms.rna_alphabet.monomers.get('253U')
    modomics_short_code_to_monomer['L'] = bpforms.rna_alphabet.monomers.get('2G')
    modomics_short_code_to_monomer['K'] = bpforms.rna_alphabet.monomers.get('1G')
    modomics_short_code_to_monomer['M'] = bpforms.rna_alphabet.monomers.get('42C')
    # modomics_short_code_to_monomer['N'] = bpforms.rna_alphabet.monomers.get('?U')
    modomics_short_code_to_monomer['P'] = bpforms.rna_alphabet.monomers.get('9U')
    modomics_short_code_to_monomer['R'] = bpforms.rna_alphabet.monomers.get('22G')
    modomics_short_code_to_monomer['T'] = bpforms.rna_alphabet.monomers.get('5U')
    modomics_short_code_to_monomer['Z'] = bpforms.rna_alphabet.monomers.get('09U')
    modomics_short_code_to_monomer['7'] = bpforms.rna_alphabet.monomers.get('7G')
    modomics_short_code_to_monomer['#'] = bpforms.rna_alphabet.monomers.get('0G')
    modomics_short_code_to_monomer[':'] = bpforms.rna_alphabet.monomers.get('0A')
    modomics_short_code_to_monomer['='] = bpforms.rna_alphabet.monomers.get('6A')
    modomics_short_code_to_monomer['?'] = bpforms.rna_alphabet.monomers.get('5C')
    modomics_short_code_to_monomer['λ'] = bpforms.rna_alphabet.monomers.get('04C')
    modomics_short_code_to_monomer['"'] = bpforms.rna_alphabet.monomers.get('1A')
    modomics_short_code_to_monomer["'"] = bpforms.rna_alphabet.monomers.get('3C')
    modomics_short_code_to_monomer[','] = bpforms.rna_alphabet.monomers.get('522U')
    modomics_short_code_to_monomer['\\'] = bpforms.rna_alphabet.monomers.get('05U')
    modomics_short_code_to_monomer['ℑ'] = bpforms.rna_alphabet.monomers.get('00G')
    modomics_short_code_to_monomer[']'] = bpforms.rna_alphabet.monomers.get('19U')
    modomics_short_code_to_monomer['ˆ'] = bpforms.rna_alphabet.monomers.get('00A')
    modomics_short_code_to_monomer['gluQtRNA'] = bpforms.rna_alphabet.monomers.get('105G')
    modomics_short_code_to_monomer['m22G'] = bpforms.rna_alphabet.monomers.get('22G')
    # modomics_short_code_to_monomer[';'] = bpforms.rna_alphabet.monomers.get('?G')
    # modomics_short_code_to_monomer['<'] = bpforms.rna_alphabet.monomers.get('?C')

    # create cache for web queries
    cache_name = os.path.join('examples', 'modomics')
    session = requests_cache.core.CachedSession(cache_name, backend='sqlite', expire_after=None)
    session.mount(URL, requests.adapters.HTTPAdapter(max_retries=5))

    # parse rRNA and tRNA data
    monomer_codes = {}
    rrna_forms = run_rrna(session, modomics_short_code_to_monomer, monomer_codes,
                             os.path.join('examples', 'modomics.rrna.tsv'))
    trna_forms, trna_canonical_code_freq, trna_code_freq = run_trna(
        session, modomics_short_code_to_monomer, monomer_codes, os.path.join('examples', 'modomics.trna.tsv'))

    # plot distribution of tRNA monomeric forms
    plot_trna_code_freq(monomer_codes, trna_code_freq)

    # return results
    return rrna_forms, trna_forms, trna_canonical_code_freq, trna_code_freq


def run_rrna(session, modomics_short_code_to_monomer, monomer_codes, out_filename):
    response = session.get(URL, params={
        'RNA_type': 'rRNA',
        'RNA_subtype': 'all',
        'organism': 'all species',
        'vis_type': 'Modomics symbols',
    })

    response.raise_for_status()

    doc = bs4.BeautifulSoup(response.text, 'lxml')
    table = doc.find('table', {'id': 'tseq'})
    tbody = table.find('tbody')
    rows = tbody.find_all('tr')
    rna_forms = []
    for row in rows:
        if not isinstance(row, bs4.element.Tag):
            continue

        cells = row.find_all('td')

        rna_form = bpforms.RnaForm()
        unsupported_codes = set()
        for child in cells[5].children:
            if child.name is None or child.name == 'span':
                if child.name is None:
                    text = str(child)
                else:
                    text = child.text

                for code in text.strip().replace('-', '').replace('_', ''):
                    monomer = modomics_short_code_to_monomer.get(code, None)
                    if monomer is None:
                        unsupported_codes.add(code)
                        monomer = bpforms.Monomer(id=code)
                    else:
                        monomer_codes[code] = monomer
                    rna_form.seq.append(monomer)
            elif child.name == 'a':
                code = child.get('href').replace('/modomics/modifications/', '')
                monomer = modomics_short_code_to_monomer.get(code, None)
                if monomer is None:
                    unsupported_codes.add(code)
                    monomer = bpforms.Monomer(id=code)
                else:
                    monomer_codes[code] = monomer
                rna_form.seq.append(monomer)
            else:
                raise Exception('Unsupported child {}'.format(child.name))

        rna_forms.append({
            'GenBank': cells[0].find('a').text.strip(),
            'Organism': cells[3].text.strip(),
            'Organellum': cells[4].text.strip(),
            'Type': cells[2].text.strip(),
            'Sequence (MODOMICS)': cells[5].text.strip().replace('-', '').replace('_', ''),
        })
        analyze_form(rna_form, unsupported_codes, rna_forms[-1])

    # save results to tab-separated file
    save_results(rna_forms, ['GenBank', 'Type'], out_filename)

    return rna_forms


def run_trna(session, modomics_short_code_to_monomer, monomer_codes, out_filename):
    response = session.get(URL, params={
        'RNA_type': 'tRNA',
        'RNA_subtype': 'all',
        'organism': 'all species',
        'vis_type': 'Modomics symbols',
    })
    response.raise_for_status()

    doc = bs4.BeautifulSoup(response.text, 'lxml')
    table = doc.find('table', {'id': 'tseq'})
    tbody = table.find('tbody')
    rows = tbody.find_all('tr')
    rna_forms = []

    code_freq = {}
    canonical_code_freq = {'A': 0, 'C': 0, 'G': 0, 'U': 0}
    for row in rows:
        cells = row.find_all('td')

        rna_form = bpforms.RnaForm()
        unsupported_codes = set()
        for child in cells[5].children:
            if child.name is None or child.name == 'span':
                if child.name is None:
                    text = str(child)
                else:
                    text = child.text

                for code in text.strip().replace('-', '').replace('_', ''):
                    monomer = modomics_short_code_to_monomer.get(code, None)
                    if monomer is None:
                        unsupported_codes.add(code)
                        monomer = bpforms.Monomer(id=code)
                    else:
                        monomer_codes[code] = monomer
                        if code not in code_freq:
                            code_freq[code] = 0
                        code_freq[code] += 1
                    rna_form.seq.append(monomer)
            elif child.name == 'a':
                code = child.get('href').replace('/modomics/modifications/', '')
                monomer = modomics_short_code_to_monomer.get(code, None)
                if monomer is None:
                    unsupported_codes.add(code)
                    monomer = bpforms.Monomer(id=code)
                else:
                    monomer_codes[code] = monomer
                    if code not in code_freq:
                        code_freq[code] = 0
                    code_freq[code] += 1
                rna_form.seq.append(monomer)
            else:
                raise Exception('Unsupported child {}'.format(child.name))

        rna_forms.append({
            'Amino acid type': cells[1].text.strip(),
            'Anticodon': cells[2].text.strip(),
            'Organism': cells[3].text.strip(),
            'Organellum': cells[4].text.strip(),
            'Sequence (MODOMICS)': cells[5].text.strip().replace('-', '').replace('_', ''),
        })
        analyze_form(rna_form, unsupported_codes, rna_forms[-1])

        canonical_code_freq['A'] += \
            rna_forms[-1]['Sequence (IUPAC)'].count('A') \
            - rna_form.seq.count(bpforms.rna_alphabet.monomers.A)
        canonical_code_freq['C'] += \
            rna_forms[-1]['Sequence (IUPAC)'].count('C') \
            - rna_form.seq.count(bpforms.rna_alphabet.monomers.C)
        canonical_code_freq['G'] += \
            rna_forms[-1]['Sequence (IUPAC)'].count('G') \
            - rna_form.seq.count(bpforms.rna_alphabet.monomers.G)
        canonical_code_freq['U'] += \
            rna_forms[-1]['Sequence (IUPAC)'].count('U') \
            - rna_form.seq.count(bpforms.rna_alphabet.monomers.U)

    # save results to tab-separated file
    save_results(rna_forms, ['Amino acid type', 'Anticodon'], out_filename)

    with open(os.path.join('examples', 'modomics.trna.canonical-code-freq.tsv'), 'w') as file:
        writer = csv.DictWriter(file, fieldnames=['Code', 'Frequency'], dialect='excel-tab')
        writer.writeheader()
        for code, freq in canonical_code_freq.items():
            writer.writerow({'Code': code, 'Frequency': freq})

    with open(os.path.join('examples', 'modomics.trna.code-freq.tsv'), 'w') as file:
        writer = csv.DictWriter(file, fieldnames=['Code', 'Frequency'], dialect='excel-tab')
        writer.writeheader()
        for code, freq in code_freq.items():
            writer.writerow({'Code': code, 'Frequency': freq})

    return rna_forms, canonical_code_freq, code_freq


def analyze_form(rna_form, unsupported_codes, results_dict):
    results_dict['Sequence (BpForms)'] = str(rna_form)
    results_dict['Sequence (IUPAC)'] = canonical_seq = rna_form.get_canonical_seq()
    results_dict['Length'] = len(rna_form.seq)

    results_dict['Number of modifications'] = len(rna_form.seq) \
        - rna_form.seq.count(bpforms.rna_alphabet.monomers.A) \
        - rna_form.seq.count(bpforms.rna_alphabet.monomers.C) \
        - rna_form.seq.count(bpforms.rna_alphabet.monomers.G) \
        - rna_form.seq.count(bpforms.rna_alphabet.monomers.U)
    results_dict['Number of modified A'] = canonical_seq.count('A') - rna_form.seq.count(bpforms.rna_alphabet.monomers.A)
    results_dict['Number of modified C'] = canonical_seq.count('C') - rna_form.seq.count(bpforms.rna_alphabet.monomers.C)
    results_dict['Number of modified G'] = canonical_seq.count('G') - rna_form.seq.count(bpforms.rna_alphabet.monomers.G)
    results_dict['Number of modified U'] = canonical_seq.count('U') - rna_form.seq.count(bpforms.rna_alphabet.monomers.U)

    if unsupported_codes:
        results_dict['BpForms errors'] = 'MODOMICS sequence uses monomeric forms {}'.format(
            ', '.join(unsupported_codes))
    else:
        results_dict['Formula'] = str(rna_form.get_formula())
        results_dict['Molecular weight'] = rna_form.get_mol_wt()
        results_dict['Charge'] = rna_form.get_charge()

        canonical_form = bpforms.RnaForm().from_str(canonical_seq)
        results_dict['Canonical formula'] = str(canonical_form.get_formula())
        results_dict['Canonical molecular weight'] = canonical_form.get_mol_wt()
        results_dict['Canonical charge'] = canonical_form.get_charge()

        results_dict['Extra formula'] = str(rna_form.get_formula() - canonical_form.get_formula())
        results_dict['Extra molecular weight'] = rna_form.get_mol_wt() - canonical_form.get_mol_wt()
        results_dict['Extra charge'] = rna_form.get_charge() - canonical_form.get_charge()

        results_dict['BpForms errors'] = ' '.join(rna_form.validate())


def save_results(rna_forms, variable_field_names, out_filename):
    with open(out_filename, 'w') as file:
        writer = csv.DictWriter(file,
                                fieldnames=variable_field_names + [
                                    'Organism', 'Organellum',
                                    'Sequence (MODOMICS)',
                                    'Sequence (BpForms)',
                                    'Sequence (IUPAC)',
                                    'Length',
                                    'Number of modifications',
                                    'Number of modified A',
                                    'Number of modified C',
                                    'Number of modified G',
                                    'Number of modified U',
                                    'Formula', 'Molecular weight', 'Charge',
                                    'Canonical formula', 'Canonical molecular weight', 'Canonical charge',
                                    'Extra formula', 'Extra molecular weight', 'Extra charge',
                                    'BpForms errors'],
                                dialect='excel-tab')
        writer.writeheader()

        for rna_form in rna_forms:
            writer.writerow(rna_form)


def plot_trna_code_freq(monomer_codes, trna_code_freq):
    monomer_ids = {}
    for code, monomer in bpforms.rna_alphabet.monomers.items():
        monomer_ids[monomer] = code

    trna_canonical_code_freq = {'A': 0, 'C': 0, 'G': 0, 'U': 0}
    for code, count in trna_code_freq.items():
        if code in ['A', 'C', 'G', 'U']:
            continue
        canonical_code = monomer_codes[code].get_canonical_code(monomer_ids)
        trna_canonical_code_freq[canonical_code] += count

    pyplot.style.use('ggplot')
    fig, axes = pyplot.subplots(nrows=1, ncols=2, gridspec_kw={'width_ratios': [1, 16]})
    fig.set_size_inches(9.3, 1.5)
    plot_codes(trna_canonical_code_freq, monomer_codes,
               axes[0], ignore_canonical=False, title='Frequency of modifications')
    plot_codes(trna_code_freq, monomer_codes,
               axes[1], ignore_canonical=True, title='Frequency of modified monomeric forms')
    if not os.path.isdir(os.path.join('bpforms', 'web', 'img', 'example')):
        os.makedirs(os.path.join('bpforms', 'web', 'img', 'example'))
    fig.savefig(os.path.join('bpforms', 'web', 'img', 'example', 'modomics.trna.code-freq.svg'),
                transparent=True,
                bbox_inches=matplotlib.transforms.Bbox([[0.4, -0.1], [8.35, 1.5]]))
    pyplot.close(fig)


def plot_codes(code_freq, monomer_codes, axis, ignore_canonical=False, use_rel=True,
               title=None, x_axis_label=None, y_axis_label='Frequency (%)',
               axis_font_size=10, tick_font_size=6, font_family='Raleway',
               x_label_pad=None):
    monomer_ids = {}
    for code, monomer in bpforms.rna_alphabet.monomers.items():
        monomer_ids[monomer] = code

    id_freqs = []
    for code, count in code_freq.items():
        if ignore_canonical and code in ['A', 'C', 'G', 'U']:
            continue
        id_freqs.append((monomer_codes[code].get_canonical_code(monomer_ids),
                         monomer_codes[code].id,
                         count))
    id_freqs.sort()

    y_pos = numpy.arange(len(id_freqs))
    freq = numpy.array([id_freq[2] for id_freq in id_freqs])
    if use_rel:
        freq = freq / numpy.sum(freq) * 100.
    x_tick_labels = {id: y_pos for y_pos, (_, id, _) in enumerate(id_freqs)}

    bars = axis.bar(y_pos, freq, align='center', alpha=1.0, edgecolor="none")

    axis.set_xticks(y_pos)
    axis.set_xticklabels(x_tick_labels, rotation=270, fontsize=tick_font_size, fontfamily=font_family)
    axis.tick_params(axis='x', pad=2)
    if x_axis_label:
        axis.set_xlabel(x_axis_label, fontdict={
            'fontsize': axis_font_size,
            'fontweight': 'regular',
            'fontfamily': 'Raleway',
        }, labelpad=x_label_pad)

    if use_rel:
        y_tick_labels = (str(int(tick)) for tick in axis.get_yticks())
    else:
        y_tick_labels = [str(int(tick * 1e-5)) for tick in axis.get_yticks()]
        y_tick_labels[0] = '0'
    axis.set_yticklabels(y_tick_labels, fontsize=tick_font_size, fontfamily=font_family)
    axis.tick_params(axis='y', pad=2)
    axis.set_ylabel(y_axis_label, fontdict={
        'fontsize': axis_font_size,
        'fontweight': 'regular',
        'fontfamily': 'Raleway',
    })

    axis.set_title(title, fontdict={
        'fontsize': axis_font_size,
        'fontweight': 'regular',
        'fontfamily': 'Raleway',
    })
    axis.set_xlim((-0.75, len(id_freqs) - 0.25))

    axis.spines['right'].set_visible(False)
    axis.spines['top'].set_visible(False)

    return axis, bars