wc_kb_gen/random/genome.py
"""
:Author: Ashwin Srinivasan <ashwins@mit.edu>
:Author: Bilal Shaikh <bilal.shaikh@columbia.edu>
:Author: Balazs Szigeti <balazs.szigeti@mssm.edu>
:Date: 2018-06-06
:Copyright: 2018, Karr Lab
:License: MIT
"""
import Bio.SeqIO
import Bio.SeqRecord
import math
import numpy
import random
import scipy.constants
import wc_kb
import wc_kb_gen
from Bio.Data import CodonTable
from Bio.Seq import Seq, Alphabet
from numpy import random
from scipy import stats
from wc_utils.util.units import unit_registry
from wc_onto import onto as wcOntology
from wc_utils.util.ontology import are_terms_equivalent
class GenomeGenerator(wc_kb_gen.KbComponentGenerator):
"""
Generate synthetic chromosome with randomized genes/intergenic
regions. Creates RNA and protein objects corresponding to the genes
this chromosome. Associates the chromosome, RNAs, proteins
with a knowledge base object (and its Cell attribute).
Options:
* num_chromosomes (:obj:`int`): number of chromosomes
* mean_gc_frac (:obj:`float`): fraction of nucleotides which are G or C
* num_genes (:obj:`float`): mean number of genes
* mean_gene_len (:obj:`float`): mean codon length of a gene
* mean_coding_frac (:obj:`float`): mean coding fraction of the genome
* translation_table (:obj:`int`): The NCBI standard genetic code used
* num_ncRNA (:obj:`float`): The proportion of non coding RNAs
* num_rRNA (:obj:`float`): The proportion of ribosomal RNAs
* tRNA_prop (:obj:`float`): The proportion of transfer RNAs
* five_prime_len (:obj:`int`): Average 5' UTR length for transcription units
* three_prime_len (:obj:`int`): Average 3' UTR length for transcription units
* operon_prop (:obj:`float`): Proportion of genes that should be in an operon (polycistronic mRNA)
* operon_gen_num (:obj:`int`): Average number of genes in an operon
* mean_copy_number (:obj:`float`): mean copy number of each RNA
* mean_half_life (:obj:`float`): mean half-life of RNAs
* genetic_code (:obj:`str`): 'normal' / 'reduced', if reduced only 'I': ['ATC'], 'L': ['CTG'],
'M': ['ATG'], 'T': ['ACG'] codons in genome
* seq_path (:obj:`str`): path to save genome sequence
"""
def clean_and_validate_options(self):
""" Apply default options and validate options """
# Default options are loosely based on Escherichia coli K-12
# Nucleic Acids Research 41:D605-12 2013
options = self.options
genetic_code = options.get('genetic_code', 'normal')
assert(genetic_code in ['normal', 'reduced'])
options['genetic_code'] = genetic_code
num_chromosomes = options.get('num_chromosomes', 1)
assert(num_chromosomes >= 1 and int(num_chromosomes) == num_chromosomes)
options['num_chromosomes'] = num_chromosomes
chromosome_topology = options.get('chromosome_topology', 'circular')
assert(chromosome_topology in ['circular', 'linear'])
options['chromosome_topology'] = chromosome_topology
num_genes = options.get('num_genes', 4500)
assert(num_genes >= 1)
options['num_genes'] = int(num_genes)
num_ncRNA = options.get('num_ncRNA', 10) # not sure
assert(isinstance(num_ncRNA, int))
options['num_ncRNA'] = num_ncRNA
# http://book.bionumbers.org/how-many-ribosomal-rna-gene-copies-are-in-the-genome/
num_rRNA = options.get('num_rRNA', 7)
assert(isinstance(num_rRNA, int))
options['num_rRNA'] = num_rRNA
num_tRNA = options.get('num_tRNA', 20)
assert(isinstance(num_tRNA, int))
options['num_tRNA'] = num_tRNA
min_prots = options.get('min_prots', 8)
assert(isinstance(min_prots, int))
options['min_prots'] = min_prots
assert((num_ncRNA + num_rRNA + num_tRNA + min_prots) <= num_genes)
mean_gc_frac = options.get('mean_gc_frac', 0.58)
assert(mean_gc_frac >= 0 and mean_gc_frac <= 1)
options['mean_gc_frac'] = mean_gc_frac
# DOI: 10.1093/molbev/msk019
mean_gene_len = options.get('mean_gene_len', 308) # codon length (924 bp)
assert(mean_gene_len >= 1)
options['mean_gene_len'] = mean_gene_len
# DOI: 10.1007/s10142-015-0433-4
mean_coding_frac = options.get('mean_coding_frac', 0.88)
assert(mean_coding_frac > 0 and mean_coding_frac < 1)
options['mean_coding_frac'] = mean_coding_frac
translation_table = int(options.get('translation_table', 1))
assert(translation_table in range(1, 32))
options['translation_table'] = translation_table
five_prime_len = int(options.get('five_prime_len', 7))
assert(five_prime_len >= 0)
options['five_prime_len'] = five_prime_len
three_prime_len = int(options.get('three_prime_len', 5)) # guess
assert(three_prime_len >= 0)
options['three_prime_len'] = three_prime_len
operon_prop = (options.get('operon_prop', 0.2)) # guess
assert(operon_prop >= 0 and operon_prop <= 1)
options['operon_prop'] = operon_prop
operon_gen_num = int(options.get('operon_gen_num', 3))
assert(operon_gen_num >= 2)
options['operon_gen_num'] = operon_gen_num
mean_rna_half_life = options.get('mean_rna_half_life', 8 * 60)
assert(mean_rna_half_life > 0)
options['mean_rna_half_life'] = mean_rna_half_life
# DOI: 10.1073/pnas.0308747101
mean_protein_half_life = options.get('mean_protein_half_life', 750 * 60)
assert(mean_protein_half_life > 0)
options['mean_protein_half_life'] = mean_protein_half_life
# DOI: 10.1038/ismej.2012.94
mean_rna_copy_number = options.get('mean_rna_copy_number', 0.4)
assert(mean_rna_copy_number > 0)
options['mean_rna_copy_number'] = mean_rna_copy_number
# DOI: 10.1038/ismej.2012.94
mean_protein_copy_number = options.get('mean_protein_copy_number', 75)
assert(mean_protein_copy_number > 0)
options['mean_protein_copy_number'] = mean_protein_copy_number
seq_path = options.get('seq_path', 'rand_seq.fna')
options['seq_path'] = seq_path
def gen_components(self):
self.gen_genome()
self.gen_tus()
self.gen_rnas_proteins()
self.gen_concentrations()
self.reduce_model()
def gen_genome(self):
'''Construct knowledge base components and generate the DNA sequence'''
# get options
options = self.options
genetic_code = options.get('genetic_code')
num_chromosomes = options.get('num_chromosomes')
mean_gene_len = options.get('mean_gene_len')
translation_table = options.get('translation_table')
num_genes = options.get('num_genes')
mean_coding_frac = options.get('mean_coding_frac')
mean_gc_frac = options.get('mean_gc_frac')
chromosome_topology = options.get('chromosome_topology')
num_ncRNA = options.get('num_ncRNA')
num_rRNA = options.get('num_rRNA')
num_tRNA = options.get('num_tRNA')
min_prots = options.get('min_prots')
seq_path = options.get('seq_path')
cell = self.knowledge_base.cell
self.knowledge_base.translation_table = translation_table
codon_table = CodonTable.unambiguous_dna_by_id[translation_table]
BASES = ['A', 'C', 'G', 'T']
PROB_BASES = [(1 - mean_gc_frac) / 2, mean_gc_frac /2, mean_gc_frac/2, (1-mean_gc_frac)/2]
if genetic_code == 'normal':
START_CODONS = codon_table.start_codons
STOP_CODONS = codon_table.stop_codons
elif genetic_code == 'reduced':
START_CODONS = ['TTA']
STOP_CODONS = ['TAA']
num_genes_all = num_genes
assignList = num_tRNA*[wcOntology['WC:tRNA']] + \
num_rRNA*[wcOntology['WC:rRNA']] + \
num_ncRNA*[wcOntology['WC:ncRNA']] + \
(num_genes_all-(num_ncRNA + num_tRNA + num_rRNA))*[wcOntology['WC:mRNA']]
random.shuffle(assignList)
# Create a chromosome n times
dna_seqs = []
for i_chr in range(num_chromosomes):
num_genes = math.ceil(num_genes_all / num_chromosomes)
gene_lens = 3 * self.rand(mean_gene_len, count=num_genes, min=2)
intergene_lens = 3 * self.rand(mean_gene_len / mean_coding_frac * (1 - mean_coding_frac), count=num_genes)
seq_len = numpy.sum(gene_lens) + numpy.sum(intergene_lens)
# Generate seq based on random codons (NOT start/stop codons)
seq_str = []
if genetic_code=='normal':
for i in range(0, seq_len, 3):
codon_i = random.choice(STOP_CODONS)
codon_i = "".join(random.choice(BASES, p=PROB_BASES, size=(3,)))
seq_str.append(codon_i)
elif genetic_code=='reduced':
for i in range(0, seq_len, 3):
codon_i = STOP_CODONS[0]
codon_i = "".join(random.choice(['ATC', 'CTG', 'ATG', 'ACG']))
seq_str.append(codon_i)
seq_str = "".join(seq_str)
seq = Seq(seq_str, Alphabet.DNAAlphabet())
chro = cell.species_types.get_or_create(id='chr_{}'.format(i_chr + 1), __type=wc_kb.core.DnaSpeciesType)
chro.name = 'Chromosome {}'.format(i_chr + 1)
chro.circular = chromosome_topology == 'circular'
chro.double_stranded = True
chro.sequence_path = seq_path
gene_starts = numpy.int64(numpy.cumsum(numpy.concatenate(([0], gene_lens[0:-1])) +
numpy.concatenate((numpy.round(intergene_lens[0:1] / 2), intergene_lens[1:]))))
# creates GeneLocus objects for the genes and labels their GeneType (which type of RNA they transcribe)
for i_gene, gene_start in enumerate(gene_starts):
gene = self.knowledge_base.cell.loci.get_or_create(
id='gene_{}_{}'.format(i_chr + 1, i_gene + 1), __type=wc_kb.prokaryote.GeneLocus)
gene.start = gene_start + 1 # 1-indexed
gene.polymer = chro
gene.end = gene.start + gene_lens[i_gene] - 1 # 1-indexed
gene.name = 'gene {} {}'.format(i_chr+1, i_gene+1)
if len(assignList) > 0:
gene.type = assignList.pop()
else:
gene.type = wcOntology['WC:mRNA']
if gene.type == wcOntology['WC:mRNA']: # if mRNA, then set up start/stop codons in the gene
start_codon = random.choice(START_CODONS)
stop_codon = random.choice(STOP_CODONS)
seq_str = str(seq)
seq_str = seq_str[:gene.start-1] + \
start_codon + \
seq_str[gene.start+2: gene.end-3] + \
stop_codon + seq_str[gene.end:]
for i in range(gene.start+2, gene.end-3, 3):
while seq_str[i:i+3] in START_CODONS or seq_str[i:i+3] in STOP_CODONS:
if genetic_code == 'normal':
codon_i = "".join(random.choice(BASES, p=PROB_BASES, size=(3,)))
elif genetic_code == 'reduced':
codon_i = "".join(random.choice(['ATC', 'CTG', 'ATG', 'ACG']))
seq_str = seq_str[:i]+codon_i+seq_str[i+3:]
seq = Seq(seq_str, Alphabet.DNAAlphabet())
dna_seqs.append(Bio.SeqRecord.SeqRecord(seq, chro.id))
with open(seq_path, 'w') as file:
writer = Bio.SeqIO.FastaIO.FastaWriter(
file, wrap=70, record2title=lambda record: record.id)
writer.write_file(dna_seqs)
def gen_tus(self):
""" Creates transcription units with 5'/3' UTRs, polycistronic mRNAs, and other types of RNA (tRNA, rRNA, sRNA) """
options = self.options
five_prime_len = options.get('five_prime_len') # 7 bp default (E. coli, wikipedia)
three_prime_len = options.get('three_prime_len') # 5 bp default guess
operon_prop = options.get('operon_prop') # 0.2 default guess
operon_gen_num = options.get('operon_gen_num') # 3 genes default (https://academic.oup.com/gbe/article/5/11/2242/653613)
for i_chr, chromosome in enumerate(self.knowledge_base.cell.species_types.get(__type=wc_kb.core.DnaSpeciesType)):
seq = chromosome.get_seq()
i_gene = 0
transcription_loci = []
# Todo make this into a proper for loop that deals with repeats/additional loci
while i_gene < len(chromosome.loci):
gene = chromosome.loci[i_gene]
if gene.type == wcOntology['WC:mRNA']:
# polycistronic mRNA (multiple GeneLocus objects per TranscriptionUnitLocus)
five_prime = self.rand(five_prime_len)[0]
three_prime = self.rand(three_prime_len)[0]
operon_prob = random.random()
# make an operon (polycistronic mRNA, put multiple genes in one TransUnitLocus)
if operon_prob <= operon_prop:
operon_genes = self.rand(operon_gen_num, min=2)[0]
# add 3', 5' UTRs to the ends of the transcription unit (upstream of first gene, downstream of last gene)
tu = self.knowledge_base.cell.loci.get_or_create(
id='tu_{}_{}'.format(i_chr + 1, i_gene + 1), __type=wc_kb.prokaryote.TranscriptionUnitLocus)
tu.name = 'tu {} {}'.format(i_chr+1, i_gene+1)
five_prime_start = gene.start - five_prime
if five_prime_start < 0:
five_prime_start = 0
tu.genes.append(gene)
tu.start = five_prime_start
for k in range(operon_genes-1):
i_gene += 1
if i_gene >= len(chromosome.loci):
break
if (chromosome.loci[i_gene]).type == wcOntology['WC:mRNA']:
gene = chromosome.loci[i_gene]
tu.genes.append(gene)
else:
break
three_prime_end = gene.end + three_prime
if three_prime_end >= len(seq):
three_prime_end = len(seq) - 1
tu.end = three_prime_end
transcription_loci.append(tu)
else: # make an individual transcription unit for the gene
five_prime_start = gene.start - five_prime
three_prime_end = gene.end + three_prime
if five_prime_start < 0:
five_prime_start = 0
if three_prime_end >= len(seq):
three_prime_end = len(seq) - 1
tu = self.knowledge_base.cell.loci.get_or_create(
id='tu_{}_{}'.format(i_chr + 1, i_gene + 1), __type=wc_kb.prokaryote.TranscriptionUnitLocus)
tu.start = five_prime_start
tu.end = three_prime_end
tu.name = 'tu {} {}'.format(i_chr+1, i_gene+1)
tu.genes.append(gene)
transcription_loci.append(tu)
i_gene += 1
# make a transcription unit that transcribes other types of RNA (tRNA, rRNA, sRNA)
else:
tu = self.knowledge_base.cell.loci.get_or_create(
id='tu_{}_{}'.format(i_chr + 1, i_gene + 1), __type=wc_kb.prokaryote.TranscriptionUnitLocus)
tu.name = 'tu {} {}'.format(i_chr+1, i_gene+1)
tu.start = gene.start
tu.end = gene.end
tu.genes.append(gene)
transcription_loci.append(tu)
i_gene += 1
for locus in transcription_loci:
locus.polymer = chromosome
def gen_rnas_proteins(self):
""" Creates RNA and protein objects corresponding to genes on chromosome. """
cell = self.knowledge_base.cell
options = self.options
mean_rna_half_life = options.get('mean_rna_half_life')
mean_protein_half_life = options.get('mean_protein_half_life')
for chromosome in self.knowledge_base.cell.species_types.get(__type=wc_kb.core.DnaSpeciesType):
for i in range(len(chromosome.loci)):
locus = chromosome.loci[i]
if type(locus) == wc_kb.prokaryote.TranscriptionUnitLocus:
tu = locus
# creates RnaSpeciesType for RNA sequence corresponding to gene
rna = self.knowledge_base.cell.species_types.get_or_create(id='rna_{}'.format(tu.id), __type=wc_kb.prokaryote.RnaSpeciesType)
rna.name = 'rna {}'.format(tu.id)
# GeneLocus object for gene sequence, attribute of ProteinSpeciesType object
if are_terms_equivalent(tu.genes[0].type, wcOntology['WC:mRNA']):
rna.type = wcOntology['WC:mRNA']
elif are_terms_equivalent(tu.genes[0].type, wcOntology['WC:rRNA']):
rna.type = wcOntology['WC:rRNA']
elif are_terms_equivalent(tu.genes[0].type, wcOntology['WC:tRNA']):
rna.type = wcOntology['WC:tRNA']
elif are_terms_equivalent(tu.genes[0].type, wcOntology['WC:ncRNA']):
rna.type = wcOntology['WC:ncRNA']
rna.half_life = random.normal(mean_rna_half_life, numpy.sqrt(mean_rna_half_life))
rna.transcription_units.append(tu)
if are_terms_equivalent(rna.type, wcOntology['WC:mRNA']):
for gene in tu.genes:
# creates ProteinSpecipe object for corresponding protein sequence(s)
prot = self.knowledge_base.cell.species_types.get_or_create(
id='prot_{}'.format(gene.id),
__type=wc_kb.prokaryote.ProteinSpeciesType)
prot.name = 'prot_{}'.format(gene.id)
prot.cell = cell
prot.cell.knowledge_base = self.knowledge_base
prot.gene = gene
prot.rna = rna
prot.half_life = random.normal(mean_protein_half_life, numpy.sqrt(mean_protein_half_life))
def gen_concentrations(self):
""" Creates the concentration objects of RNA and protein objects """
options = self.options
cell = self.knowledge_base.cell
cytosol = cell.compartments.get_one(id='c')
mean_rna_copy_number = options.get('mean_rna_copy_number')
mean_protein_copy_number = options.get('mean_protein_copy_number')
if self.knowledge_base.cell.parameters.get_one(id='mean_volume') is not None:
mean_volume = self.knowledge_base.cell.parameters.get_one(id='mean_volume').value
else:
mean_volume = 0.000000000000000067
print('"mean_volume" parameter is missing, using Mycoplasma pneumoniae value (6.7E-17L).')
for rna in cell.species_types.get(__type=wc_kb.prokaryote.RnaSpeciesType):
rna_specie = rna.species.get_or_create(compartment=cytosol)
conc = round(abs(random.normal(loc=mean_rna_copy_number,scale=15))) / scipy.constants.Avogadro / mean_volume
cell.concentrations.get_or_create(id='CONC({})'.format(rna_specie.id), species=rna_specie, value=conc, units=unit_registry.parse_units('M'))
for prot in cell.species_types.get(__type=wc_kb.prokaryote.ProteinSpeciesType):
prot_specie = prot.species.get_or_create(compartment=cytosol)
conc = round(abs(random.normal(loc=mean_protein_copy_number,scale=15))) / scipy.constants.Avogadro / mean_volume
cell.concentrations.get_or_create(id='CONC({})'.format(prot_specie.id), species=prot_specie, value=conc, units=unit_registry.parse_units('M'))
def reduce_model(self):
options = self.options
genetic_code = options.get('genetic_code')
seq_path = options.get('seq_path')
if genetic_code == 'normal':
pass
elif genetic_code == 'reduced':
cell = self.knowledge_base.cell
kb = self.knowledge_base
bases = "TCAG"
codons = [a + b + c for a in bases for b in bases for c in bases]
dna = kb.cell.species_types.get(__type = wc_kb.core.DnaSpeciesType)[0]
seq_str = str(dna.get_seq())
seq_list = list(seq_str)
for prot in kb.cell.species_types.get(__type = wc_kb.prokaryote.ProteinSpeciesType):
for base_num in range(prot.gene.start+2,prot.gene.end-3,3):
new_codon = random.choice(['ATC', 'CTG', 'ATG', 'ACG'])
seq_list[base_num]=new_codon[0]
seq_list[base_num+1]=new_codon[1]
seq_list[base_num+2]=new_codon[2]
seq_str_new = ''.join(seq_list)
seq=Seq(seq_str_new)
dna_seqs = [Bio.SeqRecord.SeqRecord(seq, dna.id)]
with open(seq_path, 'w') as file:
writer = Bio.SeqIO.FastaIO.FastaWriter(
file, wrap=70, record2title=lambda record: record.id)
writer.write_file(dna_seqs)
def rand(self, mean, count=1, min=0, max=numpy.inf):
""" Generated 1 or more random normally distributed integer(s) with standard deviation equal
to the square root of the mean value.
Args:
mean (:obj:`float`): mean value
count (:obj:`int`): number of random numbers to generate
Returns:
:obj:`int` or :obj:`numpy.ndarray` of :obj:`int`: random normally distributed integer(s)
"""
a = (min-mean)/numpy.sqrt(mean)
b = (max - mean)/numpy.sqrt(mean)
return numpy.int64(numpy.round(stats.truncnorm.rvs(a, b, loc=mean, scale=numpy.sqrt(mean), size=count)))