KarrLab/bpforms

View on GitHub
examples/pro.py

Summary

Maintainability
F
2 wks
Test Coverage
""" Generate BpForms for all of the proteins in PRO, verify
them, and calculate their properties

:Author: Jonathan Karr <karr@mssm.edu>
:Date: 2019-06-24
:Copyright: 2019, Karr Lab
:License: MIT
"""

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from matplotlib import pyplot
from xml.etree import ElementTree
import bpforms
import copy
import csv
import matplotlib
import numpy
import os
import pickle
import re
import requests
import requests_cache

IN_URL = 'https://proconsortium.org/download/current/pro_nonreasoned.obo'
IN_OBO_FILENAME = os.path.join('examples', 'pro_nonreasoned.obo')
IN_PKL_FILENAME = os.path.join('examples', 'pro_nonreasoned.pkl')
IN_TSV_FILELANE = os.path.join('examples', 'pro_input.in.tsv')  # from Darren Natale
IN_MONOMERS_FILENAME = os.path.join('examples', 'pro.monomers.csv')
UNIPROT_SEQ_ENDPOINT = 'https://www.uniprot.org/uniprot/{}.fasta'
UNIPROT_XML_ENDPOINT = 'https://www.uniprot.org/uniprot/{}.xml'

OUT_PICKLE_FILENAME = os.path.join('examples', 'pro_input.out.pkl')
OUT_PICKLE_FILENAME_2 = os.path.join('examples', 'pro_input.out.2.pkl')
OUT_TSV_FILENAME = os.path.join('examples', 'pro_input.out.tsv')
OUT_FASTA_FILENAME = os.path.join('examples', 'pro_input.fasta')
OUT_FIG_FILENAME = os.path.join('examples', 'pro_input.svg')
OUT_STRUCTURE_DIRNAME = os.path.join('examples', 'pro_input_structure')
OUT_VIZ_DIRNAME = os.path.join('examples', 'pro_input_viz')

cache_name = os.path.join('examples', 'pro')
session = requests_cache.core.CachedSession(cache_name, backend='sqlite', expire_after=None)
session.mount('https://www.uniprot.org/', requests.adapters.HTTPAdapter(max_retries=5))

AA_CHARS_TO_CODES = {
    'Ala': 'A',
    'Arg': 'R',
    'Asn': 'N',
    'Asp': 'D',
    'Cys': 'C',
    'Glu': 'E',
    'Gln': 'Q',
    'Gly': 'G',
    'His': 'H',
    'Ile': 'I',
    'Leu': 'L',
    'Lys': 'K',
    'Met': 'M',
    'Phe': 'F',
    'Pro': 'P',
    'Ser': 'S',
    'Thr': 'T',
    'Trp': 'W',
    'Tyr': 'Y',
    'Val': 'V',
}


def run(in_obo_filename=IN_OBO_FILENAME, in_pkl_filename=IN_PKL_FILENAME, in_tsv_filename=IN_TSV_FILELANE,
        in_monomers_filename=IN_MONOMERS_FILENAME,
        max_num_proteins=None,
        out_pickle_filename=OUT_PICKLE_FILENAME, out_pickle_filename_2=OUT_PICKLE_FILENAME_2,
        out_tsv_filename=OUT_TSV_FILENAME, out_fasta_filename=OUT_FASTA_FILENAME,
        out_fig_filename=OUT_FIG_FILENAME, out_structure_dirname=OUT_STRUCTURE_DIRNAME,
        out_viz_dirname=OUT_VIZ_DIRNAME):
    """ Download PRO ontology, generate proteoforms, and encode with BpForms

    Args:
        in_obo_filename (:obj:`str`, optional): path to save/read PRO ontology in OBO format
        in_pkl_filename (:obj:`str`, optional): path to save/read parsed content of PRO ontology
        in_tsv_filename (:obj:`str`, optional): path to read PRO entries in TSV format
        in_monomers_filename (:obj:`str`, optional): path to list of ids of monomeric forms used
            by PRO and their alphabet code in tab-separated format
        max_num_proteins (:obj:`int`, optional): maximum number of proteins to analyze
        out_pickle_filename (:obj:`str`, optional): path to save results in pickle format
        out_pickle_filename_2 (:obj:`str`, optional): path to save results in pickle format
        out_tsv_filename (:obj:`str`, optional): path to save results in tab-separated format
        out_fasta_filename (:obj:`str`, optional): path to save results in FASTA format
        out_fig_filename (:obj:`str`, optional): path to save plot of results
        out_structure_dirname (:obj:`str`, optional): path to save preoteoforms in CML format
        out_viz_dirname (:obj:`str`, optional): path to save preoteoforms im SVG format

    Returns:
        :obj:`list` of :obj:`dict`: proteoforms encoded with BpForms
    """
    # get the PRO ontology and extract the modified proteins from the ontology
    # proteins = get_pro_from_obo(obo_filename=in_obo_filename, pkl_filename=in_pkl_filename, max_num_proteins=max_num_proteins)
    proteins = get_pro_from_tsv(in_tsv_filename, max_num_proteins=max_num_proteins)

    # parse the modified proteins and retrieve their sequences
    if not os.path.isfile(out_pickle_filename):
        # parse the modified proteins and retrieve their sequences
        parsed_proteins = []
        for i_protein, protein in enumerate(proteins):
            if i_protein % 100 == 0:
                print('Parsing protein {} of {}'.format(i_protein + 1, len(proteins)))
            parsed_proteins.append(parse_protein(protein))

        # save the parsed proteins in pickle format
        with open(out_pickle_filename, 'wb') as file:
            pickle.dump(parsed_proteins, file)
    else:
        # load saved parsed proteins in pickle format
        with open(out_pickle_filename, 'rb') as file:
            parsed_proteins = pickle.load(file)

    # read list of monomers
    monomers = {}
    with open(in_monomers_filename, 'r') as file:
        reader = csv.DictReader(file, dialect='excel')
        for row in reader:
            monomers[row['PRO id']] = {
                'mod': bpforms.protein_alphabet.monomers.get(row['BpForms code'], None),
                'origin': [],
            }
            if row['Base monomer']:
                monomers[row['PRO id']]['origin'] = row['Base monomer'].split(', ')

    # generate list of modified monomeric forms
    for protein in parsed_proteins:
        for modification in protein['modifications']:
            if modification['monomer'] not in monomers:
                monomers[modification['monomer']] = {
                    'mod': None,
                    'origin': [],
                }

    # print list of unmapped monomers
    unmapped_monomers = []
    for monomer, code in monomers.items():
        if not code['mod']:
            unmapped_monomers.append(monomer)
    unmapped_monomers.sort()
    if unmapped_monomers:
        print('Several PRO monomeric forms have not been mapped to BpForms monomeric forms:\n  {}'.format(
            '\n  '.join(unmapped_monomers)))

    # check for inconsistencies between residue and modified monomeric form
    monomer_codes = {}
    for code, monomer in bpforms.protein_alphabet.monomers.items():
        monomer_codes[monomer] = code

    for protein in parsed_proteins:
        for modification in protein.get('modifications', []):
            if modification['residue'] and modification['monomer']:
                monomer = monomers.get(modification['monomer'], None)
                if (monomer['mod'] and monomer['mod'].get_canonical_code(monomer_codes) != modification['residue']) \
                        or (monomer['origin'] and modification['residue'] not in monomer['origin']):
                    codes = set(monomer['origin'])
                    if monomer['mod']:
                        codes.add(monomer['mod'].get_canonical_code(monomer_codes))
                    msg = 'Modified monomeric form {} potentially inconsistent with residue {} != {}'.format(
                        modification['monomer'], modification['residue'],
                        ', '.join(codes))
                    print(protein['id'] + ': ' + msg)

    # generate BpForms for each protein
    if not os.path.isdir(out_structure_dirname):
        os.mkdir(out_structure_dirname)

    if not os.path.isdir(out_viz_dirname):
        os.mkdir(out_viz_dirname)

    if not os.path.isfile(out_pickle_filename_2):
        for i_protein, protein in enumerate(parsed_proteins):
            if i_protein % 100 == 0:
                print('Generating BpForms {} of {}'.format(i_protein + 1, len(parsed_proteins)))

            protein['modified_seq'] = None
            if not protein['uniprot_id']:
                continue
            if not protein['seq']:
                continue
            if protein['pro_errors']:
                continue

            processed_form = gen_bpform(protein, monomers, monomer_codes, apply_modifications=False)
            protein['processed_seq'] = str(processed_form)
            if not processed_form.validate():
                processed_formula = processed_form.get_formula()
                protein['processed_formula'] = str(processed_formula)
                protein['processed_mol_wt'] = processed_form.get_mol_wt()
                protein['processed_charge'] = processed_form.get_charge()

            if not protein['modifications']:
                continue

            modified_form = gen_bpform(protein, monomers, monomer_codes, include_annotations=False)
            protein['modified_seq'] = str(modified_form)

            modified_form = gen_bpform(protein, monomers, monomer_codes)
            if not modified_form.validate():
                modified_formula = modified_form.get_formula()
                protein['modified_full_seq'] = str(modified_form)
                protein['modified_formula'] = str(modified_formula)
                protein['modified_mol_wt'] = modified_form.get_mol_wt()
                protein['modified_charge'] = modified_form.get_charge()

                protein['modifications_formula'] = str(modified_formula - processed_formula)
                protein['modifications_mol_wt'] = protein['modified_mol_wt'] - protein['processed_mol_wt']
                protein['modifications_charge'] = protein['modified_charge'] - protein['processed_charge']

                # with open(os.path.join(out_structure_dirname, protein['id'] + '.cml'), 'w') as file:
                #     file.write(modified_form.export('cml'))

                form = gen_bpform(protein, monomers, monomer_codes,
                                  apply_processing=False, include_annotations=True)
                seq_features = []
                if protein['processing']:
                    seq_features.append({
                        'label': 'Processed',
                        'color': '#cccccc',
                        'positions': [],
                    })
                    last = 0
                    for p in protein['processing']:
                        seq_features[0]['positions'].append([last + 1, p['start'] - 1])
                        last = p['end']
                    seq_features[0]['positions'].append([
                        protein['processing'][-1]['end'] + 1,
                        len(form.seq),
                    ])

                    if protein['processing'][0]['start'] == 1:
                        seq_features[0]['positions'].pop(0)
                    if protein['processing'][-1]['end'] == len(form.seq):
                        seq_features[0]['positions'].pop(len(seq_features[0]['positions']) - 1)

                with open(os.path.join(out_viz_dirname, protein['id'] + '.svg'), 'w') as file:
                    file.write(form.get_genomic_image(seq_features, width=910))

            if modified_form.get_canonical_seq(monomer_codes) != protein['processed_seq']:
                protein['pro_errors'].append('Modified sequence for {} not compatible with the processed sequence'.format(
                    protein['id']))

        # save the parsed proteins in pickle format
        with open(out_pickle_filename_2, 'wb') as file:
            pickle.dump(parsed_proteins, file)
    else:
        with open(out_pickle_filename_2, 'rb') as file:
            parsed_proteins = pickle.load(file)

    # save the proteoforms in TSV format
    with open(out_tsv_filename, 'w') as file:
        writer = csv.writer(file, dialect='excel-tab')
        writer.writerow(['PRO id', 'UniProt id', 'Organism',
                         'Unmodified sequence (IUBMB)',
                         'Processing', 'Deletions', 'Processsed sequence (IUBMB)', 'Processsed formula',
                         'Processsed molecular weight', 'Processsed charge',
                         'Modifications', 'Crosslinks', 'Modified sequence (abbreviated BpForms)', 'Modified sequence (BpForms)',
                         'Is modified sequence concrete', 'Modified formula', 'Modified molecular weight', 'Modified charge',
                         'Modifications formula', 'Modifications molecular weight', 'Modifications charge',
                         'PRO issues', 'Monomeric form issues'])

        for parsed_protein in parsed_proteins:
            if parsed_protein.get('pro_errors', None):
                pro_errors = '. '.join(parsed_protein['pro_errors']) + '.'
            else:
                pro_errors = None

            if parsed_protein.get('modified_errors', None):
                modified_errors = '. '.join(parsed_protein['modified_errors']) + '.'
            else:
                modified_errors = None

            writer.writerow([
                parsed_protein['id'],
                parsed_protein.get('uniprot_id', None),
                parsed_protein.get('organism', None),
                parsed_protein.get('seq', None),
                ', '.join('{}-{}'.format(p['start'], p['end']) for p in parsed_protein['processing']),
                ', '.join('{}-{}'.format(deletion[0], deletion[1]) for deletion in parsed_protein.get('deletions', [])),
                parsed_protein.get('processed_seq', None),
                parsed_protein.get('processed_formula', None),
                parsed_protein.get('processed_mol_wt', None),
                parsed_protein.get('processed_charge', None),
                ', '.join('{} --> {} ({})'.format(m['residue'] or '?', m['monomer'], ', '.join(str(p) for p in m['positions']))
                          for m in parsed_protein['modifications']),
                ', '.join('{}{}-{}{}'.format(xlink[0][1], xlink[0][0], xlink[1][1], xlink[1][0])
                          for xlink in parsed_protein.get('crosslinks', [])),
                parsed_protein.get('modified_seq', None),
                parsed_protein.get('modified_full_seq', None),
                parsed_protein.get('modified_concrete', False),
                parsed_protein.get('modified_formula', None),
                parsed_protein.get('modified_mol_wt', None),
                parsed_protein.get('modified_charge', None),
                parsed_protein.get('modifications_formula', None),
                parsed_protein.get('modifications_mol_wt', None),
                parsed_protein.get('modifications_charge', None),
                pro_errors,
                modified_errors,
            ])

    # save the proteoforms in FASTA format
    seqs = (SeqRecord(id='{} | {}'.format(protein['id'], protein['uniprot_id']),
                      seq=Seq(protein['modified_seq']),
                      description='')
            for protein in parsed_proteins
            if protein['modified_seq'])
    SeqIO.write(seqs, out_fasta_filename, "fasta")

    # analyze frequency of modifications
    plot_modifications(parsed_proteins, fig_filename=out_fig_filename)

    # return proteins
    return proteins, parsed_proteins


def get_pro_from_obo(obo_filename=IN_OBO_FILENAME, pkl_filename=IN_PKL_FILENAME, max_num_proteins=None):
    """ Get the PRO ontology and extract the modified proteins from the ontology

    Args:
        obo_filename (:obj:`str`, optional): filename to save PRO in OBO format
        pkl_filename (:obj:`str`, optional): filename to save/read PRO from pickled file
        max_num_proteins (:obj:`int`, optional): maximum number of proteins to analyze

    Returns:
        :obj:`list` of :obj:`dict`: list of PRO ontology terms for modified proteins
    """
    # download PRO
    if not os.path.isfile(obo_filename):
        response = requests.get(IN_URL)
        response.raise_for_status()
        with open(obo_filename, 'wb') as file:
            file.write(response.content)

    # parse PRO or read from cache
    if not os.path.isfile(pkl_filename):
        # parse PRO
        proteins = []
        protein = None
        with open(obo_filename, 'r') as file:
            for line in file:
                line = line.rstrip('\n')
                if line.startswith('['):
                    if line.startswith('[Term]'):
                        if max_num_proteins is not None and len(proteins) >= max_num_proteins:
                            break
                        protein = {}
                    else:
                        protein = None
                elif line and protein is not None:
                    key, _, value = line.partition(': ')
                    if key not in protein:
                        protein[key] = []
                    protein[key].append(value)

                    if key == 'comment' and value.startswith('Category=organism-modification.'):
                        proteins.append(protein)

        # save PRO in pickle format
        with open(pkl_filename, 'wb') as file:
            pickle.dump(proteins, file)
    else:
        # load PRO from pickle format
        with open(pkl_filename, 'rb') as file:
            proteins = pickle.load(file)

        if max_num_proteins is not None and max_num_proteins < len(proteins):
            proteins = proteins[0:max_num_proteins]

    # return PRO
    return proteins


def get_pro_from_tsv(filename, max_num_proteins=None):
    """ Extract PRO entries from TSV file

    Args:
        obo_filename (:obj:`str`, optional): filename to save PRO in OBO format
        max_num_proteins (:obj:`int`, optional): maximum number of proteins to analyze

    Returns:
        :obj:`list` of :obj:`dict`: list of PRO ontology terms for modified proteins
    """
    proteins = []
    with open(filename, 'r') as file:
        reader = csv.DictReader(file, fieldnames=('id', 'category', 'synonym_type', 'seq'), dialect='excel-tab')
        for row in reader:
            proteins.append({
                'id': [row['id']],
                'category': [row['category']],
                'synonym': ['"{}" {} PRO-proteoform-std'.format(row['seq'], row['synonym_type'])],
            })

            if max_num_proteins is not None and len(proteins) >= max_num_proteins:
                break

    return proteins


def parse_protein(protein):
    """ Parse the modification information from a term for a modified protein

    Args:
        protein (:obj:`dict`): term for a modified protein

    Returns:
        :obj:`dict` with PRO id, UniProt id, processing start position, processing end position, unmodified sequence, and modifications
    """
    assert len(protein['id']) == 1
    id = protein['id'][0]
    errors = []

    seq_synonyms = []
    for synonym in protein.get('synonym', []):
        if synonym.startswith('"UniProtKB:') and ' PRO-proteoform-std' in synonym:
            seq_synonyms.append(synonym)
    if not seq_synonyms:
        errors.append('No synonym which defines a modified sequence')
        return {
            'id': id,
            'uniprot_id': None,
            'processing': [],
            'modifications': [],
            'seq': None,
            'pro_errors': errors,
        }
    elif len(seq_synonyms) > 1:
        errors.append('Multiple synonyms which define modified sequences')
    synonym = seq_synonyms[0]

    uniprot_id, _, processing_modifications_type = synonym.partition(', ')
    uniprot_id = uniprot_id.partition(':')[2]

    organism_name = None
    response = session.get(UNIPROT_XML_ENDPOINT.format(uniprot_id))
    response.raise_for_status()
    if response.content:
        xml_root = ElementTree.fromstring(response.content)
        entry = xml_root.find('{http://uniprot.org/uniprot}entry')
        organism = entry.find('{http://uniprot.org/uniprot}organism')
        names = organism.findall('{http://uniprot.org/uniprot}name')
        for name in names:
            if name.get('type') == 'scientific':
                organism_name = name.text
                break

    response = session.get(UNIPROT_SEQ_ENDPOINT.format(uniprot_id))
    response.raise_for_status()
    seq = response.content.decode('utf-8').partition('\n')[2].replace('\n', '')
    if not seq:
        errors.append('No sequence for UniProt entry; entry may be deprecated')

    processing_modifications = processing_modifications_type.partition('"')[0]
    processing = []
    while True:
        match = re.match(r'^(\?|\d+)\-(\?|\d+)(, |$)', processing_modifications)
        if match:
            if match.group(1) == '?':
                start = None
                errors.append('Unknown processing start position')
            else:
                start = int(float(match.group(1)))
                if start <= 0 or start > len(seq):
                    errors.append('Start position must be within sequence')
            if match.group(2) == '?':
                end = None
                errors.append('Unknown processing end position')
            else:
                end = int(float(match.group(2)))
                if end <= 0 or end > len(seq):
                    errors.append('End position must be within sequence')

            if start and end and start > end:
                errors.append('End position must be after start position')

            processing.append({
                'start': start,
                'end': end,
            })
            processing_modifications = processing_modifications[len(match.group(0)):]
        else:
            break
    if processing_modifications.startswith('which') \
            or processing_modifications.startswith('with') \
            or 'MOD:00046 OR Thr-163, MOD:00047' in processing_modifications:
        modifications_str = []
        errors.append('Unable to parse sequence')
    elif processing_modifications:
        modifications_str = processing_modifications.split('|')
    else:
        modifications_str = []

    modifications = []
    for modification in modifications_str:
        if modification and modification[0] == '(' and modification[-1] == ')':
            modification = modification[1:-1]

        if ' or ' in modification or ' and/or ' in modification:
            errors.append('Boolean logic not supported')
            continue

        if ', ' in modification:
            residue_positions, _, monomer = modification.partition(', ')

            residue_codes = set()
            positions = []
            for residue_position in residue_positions.split('/'):
                residue_chars, _, position = residue_position.partition('-')
                residue_code = AA_CHARS_TO_CODES[residue_chars]
                position = int(float(position))
                if position > len(seq):
                    errors.append('Position {} is greater than the sequence length {}'.format(position, len(seq)))
                elif seq[position - 1] != residue_code:
                    errors.append('Position {} != {}'.format(position, residue_code))
                residue_codes.add(residue_code)
                positions.append(position)
            if len(residue_codes) != 1 and monomer != 'PR:000026291':
                residue_code = None
                errors.append('Residues {{{}}} annotated with the same modification {}'.format(
                    ', '.join(residue_codes), monomer))
        else:
            residue_code = None
            positions = []
            monomer = modification

        if monomer == 'PR:000026291':
            for residue_code in residue_codes:
                modifications.append({
                    'residue': residue_code,
                    'positions': [p for p in positions if seq[p - 1] == residue_code],
                    'monomer': monomer,
                })
        else:
            modifications.append({
                'residue': residue_code,
                'positions': positions,
                'monomer': monomer,
            })

    return {
        'id': id,
        'uniprot_id': uniprot_id,
        'organism': organism_name,
        'processing': processing,
        'modifications': modifications,
        'seq': seq,
        'pro_errors': errors,
    }


def gen_bpform(protein, pro_ids_to_bpform_monomers, monomer_codes,
               apply_processing=True, apply_modifications=True, include_annotations=True):
    """ Generate BpForm for a modified protein in PRO

    Args:
        protein (:obj:`dict`): term for modified protein
        pro_ids_to_bpform_monomers (:obj:`dict`): dictionary which maps ids of monomeric forms
            used by PRO to monomeric forms in the BpForms protein alphabet
        monomer_codes (:obj:`dict`): dictionary that maps monomers to their codes in the alphabet
        apply_processing (:obj:`bool`, optional): if :obj:`True`, include processing in proteoform
        apply_modifications (:obj:`bool`, optional): if :obj:`True`, include modifications in proteoform
        include_annotations (:obj:`bool`, optional): if :obj:`True`, include metadata about modified monomers

    Returns:
        :obj:`bpforms.ProteinForm`: BpForm for a term in PRO
    """
    form = bpforms.ProteinForm()
    monomers = bpforms.protein_alphabet.monomers

    # generate BpForm for unmodified sequence
    for base in protein['seq']:
        form.seq.append(monomers[base])

    # apply processing
    modifications = copy.deepcopy(protein['modifications'])
    seq = protein['seq']
    if apply_processing and protein['processing']:
        procesed_seq = []
        seq = ''
        for processing in protein['processing']:
            procesed_seq.extend(form.seq[processing['start']-1:processing['end']])
            seq += protein['seq'][processing['start']-1:processing['end']]
        form.seq = procesed_seq

        for modification in modifications:
            modification['processed_positions'] = []
            for position in modification['positions']:
                seq_len = 0
                processed_position = None
                for processing in protein['processing']:
                    if position >= processing['start'] and position <= processing['end']:
                        processed_position = seq_len + position - processing['start'] + 1
                        break
                    seq_len += processing['end'] - processing['start'] + 1
                if processed_position is not None:
                    modification['processed_positions'].append(processed_position)
    else:
        for modification in modifications:
            modification['processed_positions'] = modification['positions']

    # apply modifications
    if apply_modifications:
        concrete = True
        protein['modified_errors'] = []

        for modification in modifications:
            monomer = pro_ids_to_bpform_monomers[modification['monomer']]['mod']
            origin = pro_ids_to_bpform_monomers[modification['monomer']]['origin']

            if modification['monomer'].startswith('CHEBI:'):
                mod_ns = 'chebi'
            elif modification['monomer'].startswith('MOD:'):
                mod_ns = 'mod'
            elif modification['monomer'].startswith('PR:'):
                mod_ns = 'pr'
            elif modification['monomer'].startswith('UniCarbKB:'):
                mod_ns = 'unicarbkb'
            else:
                raise ValueError('Unsupported identifier {}'.format(modification['monomer']))

            if modification['monomer'] == 'PR:000026291':
                if include_annotations:
                    monomer = bpforms.Monomer().from_dict(
                        monomers[modification['residue']].to_dict(
                            alphabet=bpforms.protein_alphabet),
                        alphabet=bpforms.protein_alphabet)
                else:
                    monomer = bpforms.Monomer()
                monomer.id = None
                monomer.name = None
                monomer.synonyms = []
                monomer.identifiers = [bpforms.Identifier('pr', modification['monomer'])]
                monomer.comments = None

            elif modification['monomer'].startswith('CHEBI:'):
                if include_annotations:
                    monomer = bpforms.Monomer().from_dict(
                        monomers[modification['residue']].to_dict(
                            alphabet=bpforms.protein_alphabet),
                        alphabet=bpforms.protein_alphabet)
                else:
                    monomer = bpforms.Monomer()
                monomer.id = None
                monomer.name = None
                monomer.synonyms = []
                monomer.identifiers = [bpforms.Identifier('chebi', modification['monomer'])]
                monomer.comments = None

            elif monomer is None:
                concrete = False

                monomer = bpforms.Monomer(
                    identifiers=[bpforms.Identifier(mod_ns, modification['monomer'])])

            if modification['positions']:
                for position in modification['processed_positions']:
                    if form.seq[position - 1] == monomers[seq[position - 1]]:
                        if monomer not in bpforms.protein_alphabet.monomers.values():
                            monomer.base_monomers = [form.seq[position - 1]]
                        form.seq[position - 1] = monomer
                    else:
                        protein['modified_errors'].append('Unable to set monomeric form at position {}'.format(
                            position))

            elif modification['residue']:
                concrete = False

                if include_annotations:
                    monomer2 = bpforms.Monomer().from_dict(
                        monomer.to_dict(
                            alphabet=bpforms.protein_alphabet),
                        alphabet=bpforms.protein_alphabet)
                else:
                    monomer2 = bpforms.Monomer()
                monomer2.id = None
                monomer2.name = None
                monomer2.synonyms = []
                monomer2.identifiers = [bpforms.Identifier(mod_ns, modification['monomer'])]
                monomer2.base_monomers = [bpforms.protein_alphabet.monomers.get(modification['positions'])]

                monomer2.start_position = seq.find(modification['residue']) + 1
                monomer2.end_position = seq.rfind(modification['residue']) + 1
                set_monomer = False
                for i_monomer in range(monomer2.start_position, monomer2.end_position + 1):
                    if form.seq[i_monomer - 1] == monomers[seq[i_monomer - 1]]:
                        set_monomer = True
                        form.seq[i_monomer - 1] = monomer2
                        break
                if not set_monomer:
                    protein['modified_errors'].append('Unable to set monomeric form')
            else:
                concrete = False

                canonical_code = monomer.get_canonical_code(monomer_codes)
                if include_annotations:
                    monomer2 = bpforms.Monomer().from_dict(
                        monomer.to_dict(
                            alphabet=bpforms.protein_alphabet),
                        alphabet=bpforms.protein_alphabet)
                else:
                    monomer2 = bpforms.Monomer()
                monomer2.id = None
                monomer2.name = None
                monomer2.synonyms = []
                monomer2.identifiers = [bpforms.Identifier(mod_ns, modification['monomer'])]
                monomer2.monomers_position = [
                    bpforms.protein_alphabet.monomers.get(code) for code in origin]

                if canonical_code and canonical_code != '?':
                    start_position = seq.find(canonical_code) + 1
                    end_position = seq.rfind(canonical_code) + 1
                    if start_position == 0:
                        protein['modified_errors'].append('Sequence does not contain residue {} for modification {}'.format(
                            canonical_code, modification['monomer']))
                    else:
                        monomer2.start_position = start_position
                        monomer2.end_position = end_position

                elif origin:
                    start_position = float('inf')
                    end_position = -float('inf')
                    for base in origin:
                        start_pos = seq.find(base) + 1
                        if start_pos > 0:
                            start_position = min(start_position, start_pos)

                        end_pos = seq.rfind(base) + 1
                        if end_pos > 0:
                            end_position = max(end_position, end_pos)

                    if numpy.isinf(start_position):
                        protein['modified_errors'].append('Sequence does not contain residues {} for modification {}'.format(
                            ', '.join(origin), modification['monomer']))
                    else:
                        monomer2.start_position = start_position
                        monomer2.end_position = end_position

                else:
                    monomer2.start_position = 1
                    monomer2.end_position = len(seq)

                if monomer2.start_position:
                    set_monomer = False
                    for i_monomer in range(monomer2.start_position, monomer2.end_position + 1):
                        if form.seq[i_monomer - 1] == monomers[seq[i_monomer - 1]]:
                            monomer2.base_monomers = [bpforms.protein_alphabet.monomers.get(seq[i_monomer - 1])]
                            form.seq[i_monomer - 1] = monomer2
                            set_monomer = True
                            break
                    if not set_monomer:
                        protein['modified_errors'].append('Unable to set monomeric form')

    # crosslinks
    if protein['processing']:
        xlinks = []
        seq_len = 0
        protein['crosslinks'] = []
        protein['deletions'] = []
        for left, right in zip(protein['processing'][0:-1], protein['processing'][1:]):
            seq_len += left['end'] - left['start'] + 1
            i_left = seq_len
            i_right = i_left + 1

            if left['end'] + 1 == right['start']:
                protein['crosslinks'].append(((left['end'], protein['seq'][left['end']-1]),
                                              (right['start'], protein['seq'][right['start'] - 1])))
            else:
                protein['deletions'].append((left['end'] + 1, right['start'] - 1))

            if left['end'] + 1 != right['start']:
                continue

            #err = False
            # if protein['seq'][left['end'] - 1] != 'C' or form.seq[i_left - 1] != bpforms.protein_alphabet.monomers.C:
            #    err = True
            #    protein['modified_errors'].append('Disulfide bond site {}{} != C'.format(
            #        protein['seq'][left['end'] - 1], left['end']))
            # if protein['seq'][right['start'] - 1] != 'C' or form.seq[i_right - 1] != bpforms.protein_alphabet.monomers.C:
            #    err = True
            #    protein['modified_errors'].append('Disulfide bond site {}{} != C'.format(
            #        protein['seq'][right['start'] - 1], right['start']))
            #
            # if err:
            #    continue

            concrete = False

            i_left = '{}-{}'.format(seq_len - (left['end'] - left['start'] + 1) + 1, seq_len)
            i_right = '{}-{}'.format(seq_len + 1, seq_len + (right['end'] - right['start'] + 1))
            if apply_modifications:
                form.crosslinks.add(bpforms.Bond(
                    #l_bond_atoms=[bpforms.Atom(bpforms.Monomer, 'S', position=11, monomer=i_left)],
                    #r_bond_atoms=[bpforms.Atom(bpforms.Monomer, 'S', position=11, monomer=i_right)],
                    #l_displaced_atoms=[bpforms.Atom(bpforms.Monomer, 'H', position=11, monomer=i_left)],
                    #r_displaced_atoms=[bpforms.Atom(bpforms.Monomer, 'H', position=11, monomer=i_right)],
                    comments='The polymer contains a disulfide bond between the ranges {} and {}'.format(i_left, i_right),
                ))

    # validate
    if apply_modifications:
        protein['modified_concrete'] = concrete
        protein['modified_errors'].extend(form.validate())

    # return proteoform represented with BpForms
    return form


def plot_modifications(proteins, organism='Homo sapiens', fig_filename=OUT_FIG_FILENAME):
    """ Plot a summary of the modifications in PRO

    Args:
        proteins (:obj:`list` of :obj:`dict`): entries in PRO ontology
        organism (:obj:`str`, optional): organism to analyze
        fig_filename (:obj:`str`, optional): path to save analysis
    """
    code_freq = {}
    canonical_code_freq = {}
    for protein in proteins:
        if (organism is None or protein.get('organism', None) == organism) and protein.get('modified_seq', None):
            for modification in protein['modifications']:
                if modification['residue'] and modification['monomer']:
                    n_mods = max(1, len(modification['positions']))
                    if modification['residue'] not in canonical_code_freq:
                        canonical_code_freq[modification['residue']] = 0
                    if modification['monomer'] not in code_freq:
                        code_freq[modification['monomer']] = 0
                    canonical_code_freq[modification['residue']] += n_mods
                    code_freq[modification['monomer']] += n_mods

    pyplot.style.use('ggplot')
    fig, axes = pyplot.subplots(nrows=1, ncols=2, gridspec_kw={'width_ratios': [1, 4]})
    fig.set_size_inches(9.3, 1.5)
    plot_codes(canonical_code_freq,
               'Frequency of modifications',
               axes[0], ignore_canonical=False)
    plot_codes(code_freq,
               'Frequency of modified monomeric forms',
               axes[1], ignore_canonical=True)
    fig.savefig(fig_filename, transparent=True,
                bbox_inches=matplotlib.transforms.Bbox([[0.69, -0.5], [8.35, 1.5]]))
    pyplot.close(fig)


def plot_codes(code_freq, title, axis, ignore_canonical=False):
    id_freqs = []
    for code, count in code_freq.items():
        if ignore_canonical and code in ['A', 'C', 'G', 'U']:
            continue
        id_freqs.append((code, count))
    id_freqs.sort()

    y_pos = numpy.arange(len(id_freqs))
    freq = numpy.array([id_freq[-1] for id_freq in id_freqs])
    freq = freq / numpy.sum(freq) * 100.
    x_tick_labels = {id: y_pos for y_pos, (id, _) in enumerate(id_freqs)}

    axis.bar(y_pos, freq, align='center', alpha=0.5)
    axis.set_xticks(y_pos)
    axis.set_xticklabels(x_tick_labels, rotation=270, fontsize=6, fontfamily='Raleway')
    axis.set_ylabel('Frequency (%)', fontdict={
        'fontsize': 10,
        'fontweight': 'regular',
        'fontfamily': 'Raleway',
    })
    axis.set_title(title, fontdict={
        'fontsize': 10,
        'fontweight': 'regular',
        'fontfamily': 'Raleway',
    })
    axis.set_xlim((-0.75, len(id_freqs) - 0.25))