Debugging FANTOM5 HG38 labels.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from crr_labels import fantom"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Request the GM12878 enhancer and promoter labels from crr_labels for the hg38 assembly.\n",
"enhancers, promoters = fantom(\n",
"    cell_lines=[\"GM12878\"],  # list of cell lines to be considered.\n",
"    window_size=256,  # window size to use for the various regions.\n",
"    genome=\"hg38\",  # genome assembly the labels are requested for.\n",
"    center_enhancers=\"peak\",  # how to center the enhancer window, either around \"peak\" or the \"center\" of the region.\n",
"    enhancers_threshold=0,  # activation threshold for the enhancers.\n",
"    promoters_threshold=5,  # activation threshold for the promoters.\n",
"    drop_always_inactive_rows=False,  # whether to drop the rows where no activation is detected (presumably across all cell lines -- TODO confirm).\n",
"    binarize=True,  # whether to return the data binary-encoded, zero for inactive, one for active.\n",
"    nrows=None,  # the number of rows to read, useful when testing pipelines for creating smaller datasets.\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Write the promoter rows whose `lifted` flag is False to CSV, omitting the index column.\n",
"promoters.loc[~promoters[\"lifted\"]].to_csv(\"promoters_hg38.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chrom</th>\n",
" <th>chromStart</th>\n",
" <th>chromEnd</th>\n",
" <th>strand</th>\n",
" <th>GM12878</th>\n",
" <th>lifted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr1</td>\n",
" <td>564344</td>\n",
" <td>564600</td>\n",
" <td>+</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr1</td>\n",
" <td>564393</td>\n",
" <td>564649</td>\n",
" <td>+</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr1</td>\n",
" <td>565022</td>\n",
" <td>565278</td>\n",
" <td>+</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr1</td>\n",
" <td>565227</td>\n",
" <td>565483</td>\n",
" <td>+</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr1</td>\n",
" <td>565285</td>\n",
" <td>565541</td>\n",
" <td>+</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96717</th>\n",
" <td>chrY</td>\n",
" <td>21906594</td>\n",
" <td>21906850</td>\n",
" <td>-</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96718</th>\n",
" <td>chrY</td>\n",
" <td>21906623</td>\n",
" <td>21906879</td>\n",
" <td>-</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96719</th>\n",
" <td>chrY</td>\n",
" <td>21906761</td>\n",
" <td>21907017</td>\n",
" <td>-</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96720</th>\n",
" <td>chrY</td>\n",
" <td>23613727</td>\n",
" <td>23613983</td>\n",
" <td>-</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96721</th>\n",
" <td>chrX</td>\n",
" <td>52386980</td>\n",
" <td>52387236</td>\n",
" <td>-</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96722 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" chrom chromStart chromEnd strand GM12878 lifted\n",
"0 chr1 564344 564600 + 0 True\n",
"1 chr1 564393 564649 + 0 True\n",
"2 chr1 565022 565278 + 0 True\n",
"3 chr1 565227 565483 + 0 True\n",
"4 chr1 565285 565541 + 0 True\n",
"... ... ... ... ... ... ...\n",
"96717 chrY 21906594 21906850 - 0 True\n",
"96718 chrY 21906623 21906879 - 0 True\n",
"96719 chrY 21906761 21907017 - 0 True\n",
"96720 chrY 23613727 23613983 - 0 True\n",
"96721 chrX 52386980 52387236 - 0 True\n",
"\n",
"[96722 rows x 6 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the promoter rows whose `lifted` flag is True.\n",
"promoters.loc[promoters[\"lifted\"]]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"##ColumnVariables[00Annotation]=CAGE peak id\r\n",
"##ColumnVariables[short_description]=short form of the description below. Common descriptions in the long descriptions has been omited\r\n",
"##ColumnVariables[description]=description of the CAGE peak\r\n",
"##ColumnVariables[association_with_transcript]=transcript which 5end is the nearest to the the CAGE peak\r\n",
"##ColumnVariables[entrezgene_id]=entrezgene (genes) id associated with the transcript\r\n",
"##ColumnVariables[hgnc_id]=hgnc (genes) id associated with the transcript\r\n",
"##ColumnVariables[uniprot_id]=uniprot (protein) id associated with the transcript\r\n",
"##ParemeterValue[genome_assembly]=hg38\r\n",
"##ColumnVariables[tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode]=TPM (tags per million) of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode\r\n",
"##ColumnVariables[tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode]=TPM (tags per million) of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode\r\n"
]
}
],
"source": [
"# Peek at the first 10 lines of the raw CAGE peaks file to inspect its header.\n",
"!head -n 10 \"fantom_data/hg38_fair+new_CAGE_peaks_phase1and2_tpm_ann.osc.txt\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Load the tab-separated CAGE peaks table. \"#\" is treated as a comment marker,\n",
"# so the \"##...\" metadata lines at the top of the file are ignored.\n",
"fantom_data = pd.read_csv(\n",
"    \"fantom_data/hg38_fair+new_CAGE_peaks_phase1and2_tpm_ann.osc.txt\",\n",
"    comment=\"#\",\n",
"    sep=\"\\t\",\n",
"    low_memory=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Load the tab-separated enhancers expression matrix; pandas infers the\n",
"# gzip compression from the \".gz\" file extension.\n",
"enhancers_data = pd.read_csv(\n",
"    \"fantom_data/F5.hg38.enhancers.expression.tpm.matrix.gz\",\n",
"    low_memory=False,\n",
"    sep=\"\\t\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>CNhs11844</th>\n",
" <th>CNhs11251</th>\n",
" <th>CNhs11282</th>\n",
" <th>CNhs10746</th>\n",
" <th>CNhs11253</th>\n",
" <th>CNhs13053</th>\n",
" <th>CNhs13054</th>\n",
" <th>CNhs13502</th>\n",
" <th>CNhs13052</th>\n",
" <th>...</th>\n",
" <th>CNhs10654</th>\n",
" <th>CNhs10635</th>\n",
" <th>CNhs11766</th>\n",
" <th>CNhs11765</th>\n",
" <th>CNhs10612</th>\n",
" <th>CNhs13464</th>\n",
" <th>CNhs11676</th>\n",
" <th>CNhs11763</th>\n",
" <th>CNhs12854</th>\n",
" <th>CNhs12844</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr10:100006233-100006603</td>\n",
" <td>1.168411</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.207444</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.149924</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr10:100008181-100008444</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.075973</td>\n",
" <td>0.201635</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr10:100014348-100014634</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.192173</td>\n",
" <td>0.097232</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.080305</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr10:100020065-100020562</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1.442423</td>\n",
" <td>0.000000</td>\n",
" <td>0.604904</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr10:100043485-100043744</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.234773</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.075973</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63280</th>\n",
" <td>chrY:7520195-7520556</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.149924</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63281</th>\n",
" <td>chrY:7724230-7724512</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63282</th>\n",
" <td>chrY:7769899-7770218</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.455839</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63283</th>\n",
" <td>chrY:7796227-7796534</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63284</th>\n",
" <td>chrY:8007529-8007781</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>63285 rows × 1830 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 CNhs11844 CNhs11251 CNhs11282 CNhs10746 \\\n",
"0 chr10:100006233-100006603 1.168411 0.0 0.000000 0.000000 \n",
"1 chr10:100008181-100008444 0.000000 0.0 0.000000 0.000000 \n",
"2 chr10:100014348-100014634 0.000000 0.0 0.192173 0.097232 \n",
"3 chr10:100020065-100020562 0.000000 0.0 0.000000 0.000000 \n",
"4 chr10:100043485-100043744 0.000000 0.0 0.000000 0.000000 \n",
"... ... ... ... ... ... \n",
"63280 chrY:7520195-7520556 0.000000 0.0 0.000000 0.000000 \n",
"63281 chrY:7724230-7724512 0.000000 0.0 0.000000 0.000000 \n",
"63282 chrY:7769899-7770218 0.000000 0.0 0.000000 0.000000 \n",
"63283 chrY:7796227-7796534 0.000000 0.0 0.000000 0.000000 \n",
"63284 chrY:8007529-8007781 0.000000 0.0 0.000000 0.000000 \n",
"\n",
" CNhs11253 CNhs13053 CNhs13054 CNhs13502 CNhs13052 ... CNhs10654 \\\n",
"0 0.207444 0.0 0.000000 0.0 0.0 ... 0.000000 \n",
"1 0.000000 0.0 0.000000 0.0 0.0 ... 0.000000 \n",
"2 0.000000 0.0 0.000000 0.0 0.0 ... 0.080305 \n",
"3 0.000000 0.0 0.000000 0.0 0.0 ... 0.000000 \n",
"4 0.000000 0.0 0.234773 0.0 0.0 ... 0.000000 \n",
"... ... ... ... ... ... ... ... \n",
"63280 0.000000 0.0 0.000000 0.0 0.0 ... 0.000000 \n",
"63281 0.000000 0.0 0.000000 0.0 0.0 ... 0.000000 \n",
"63282 0.000000 0.0 0.000000 0.0 0.0 ... 0.000000 \n",
"63283 0.000000 0.0 0.000000 0.0 0.0 ... 0.000000 \n",
"63284 0.000000 0.0 0.000000 0.0 0.0 ... 0.000000 \n",
"\n",
" CNhs10635 CNhs11766 CNhs11765 CNhs10612 CNhs13464 CNhs11676 \\\n",
"0 0.149924 0.0 0.000000 0.000000 0.000000 0.0 \n",
"1 0.000000 0.0 0.000000 0.075973 0.201635 0.0 \n",
"2 0.000000 0.0 0.000000 0.000000 0.000000 0.0 \n",
"3 0.000000 0.0 1.442423 0.000000 0.604904 0.0 \n",
"4 0.000000 0.0 0.000000 0.075973 0.000000 0.0 \n",
"... ... ... ... ... ... ... \n",
"63280 0.149924 0.0 0.000000 0.000000 0.000000 0.0 \n",
"63281 0.000000 0.0 0.000000 0.000000 0.000000 0.0 \n",
"63282 0.000000 0.0 0.000000 0.455839 0.000000 0.0 \n",
"63283 0.000000 0.0 0.000000 0.000000 0.000000 0.0 \n",
"63284 0.000000 0.0 0.000000 0.000000 0.000000 0.0 \n",
"\n",
" CNhs11763 CNhs12854 CNhs12844 \n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"63280 0.0 0.0 0.0 \n",
"63281 0.0 0.0 0.0 \n",
"63282 0.0 0.0 0.0 \n",
"63283 0.0 0.0 0.0 \n",
"63284 0.0 0.0 0.0 \n",
"\n",
"[63285 rows x 1830 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the enhancers matrix (pandas abbreviates the rendered rows and columns).\n",
"enhancers_data"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>00Annotation</th>\n",
" <th>short_description</th>\n",
" <th>description</th>\n",
" <th>association_with_transcript</th>\n",
" <th>entrezgene_id</th>\n",
" <th>hgnc_id</th>\n",
" <th>uniprot_id</th>\n",
" <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode</th>\n",
" <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode</th>\n",
" <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode</th>\n",
" <th>...</th>\n",
" <th>tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode</th>\n",
" <th>tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode</th>\n",
" <th>tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode</th>\n",
" <th>tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode</th>\n",
" <th>tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode</th>\n",
" <th>tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode</th>\n",
" <th>tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode</th>\n",
" <th>tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode</th>\n",
" <th>tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode</th>\n",
" <th>tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>95719</th>\n",
" <td>hg19::chr8:146017737..146017748,-;hg_95721.1</td>\n",
" <td>p2@RPL8</td>\n",
" <td>CAGE_peak_2_at_RPL8_5end</td>\n",
" <td>2bp_to_ENST00000528957.5,uc064rpz.1_5end</td>\n",
" <td>6132</td>\n",
" <td>HGNC:10368</td>\n",
" <td>P62917</td>\n",
" <td>28.246498</td>\n",
" <td>28.931535</td>\n",
" <td>27.971083</td>\n",
" <td>...</td>\n",
" <td>15.298796</td>\n",
" <td>8.461104</td>\n",
" <td>7.610119</td>\n",
" <td>9.904102</td>\n",
" <td>8.346005</td>\n",
" <td>1.706283</td>\n",
" <td>6.655068</td>\n",
" <td>1.938596</td>\n",
" <td>3.94557</td>\n",
" <td>10.681473</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 1836 columns</p>\n",
"</div>"
],
"text/plain": [
" 00Annotation short_description \\\n",
"95719 hg19::chr8:146017737..146017748,-;hg_95721.1 p2@RPL8 \n",
"\n",
" description association_with_transcript \\\n",
"95719 CAGE_peak_2_at_RPL8_5end 2bp_to_ENST00000528957.5,uc064rpz.1_5end \n",
"\n",
" entrezgene_id hgnc_id uniprot_id \\\n",
"95719 6132 HGNC:10368 P62917 \n",
"\n",
" tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode \\\n",
"95719 28.246498 \n",
"\n",
" tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode \\\n",
"95719 28.931535 \n",
"\n",
" tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode \\\n",
"95719 27.971083 \n",
"\n",
" ... \\\n",
"95719 ... \n",
"\n",
" tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode \\\n",
"95719 15.298796 \n",
"\n",
" tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode \\\n",
"95719 8.461104 \n",
"\n",
" tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode \\\n",
"95719 7.610119 \n",
"\n",
" tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode \\\n",
"95719 9.904102 \n",
"\n",
" tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode \\\n",
"95719 8.346005 \n",
"\n",
" tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode \\\n",
"95719 1.706283 \n",
"\n",
" tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode \\\n",
"95719 6.655068 \n",
"\n",
" tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode \\\n",
"95719 1.938596 \n",
"\n",
" tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode \\\n",
"95719 3.94557 \n",
"\n",
" tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode \n",
"95719 10.681473 \n",
"\n",
"[1 rows x 1836 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select the rows whose `00Annotation` contains the literal substring 146017737.\n",
"fantom_data.loc[fantom_data[\"00Annotation\"].str.contains(\"146017737\", regex=False)]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>00Annotation</th>\n",
" <th>short_description</th>\n",
" <th>description</th>\n",
" <th>association_with_transcript</th>\n",
" <th>entrezgene_id</th>\n",
" <th>hgnc_id</th>\n",
" <th>uniprot_id</th>\n",
" <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode</th>\n",
" <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode</th>\n",
" <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode</th>\n",
" <th>...</th>\n",
" <th>tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode</th>\n",
" <th>tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode</th>\n",
" <th>tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode</th>\n",
" <th>tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode</th>\n",
" <th>tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode</th>\n",
" <th>tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode</th>\n",
" <th>tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode</th>\n",
" <th>tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode</th>\n",
" <th>tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode</th>\n",
" <th>tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>95717</th>\n",
" <td>hg19::chr8:146016750..146016780,-;hg_95719.1</td>\n",
" <td>p3@RPL8</td>\n",
" <td>CAGE_peak_3_at_RPL8_5end</td>\n",
" <td>494bp_to_ENST00000534781.1,uc064rpt.1_5end</td>\n",
" <td>6132</td>\n",
" <td>HGNC:10368</td>\n",
" <td>NaN</td>\n",
" <td>3.116855</td>\n",
" <td>3.034776</td>\n",
" <td>2.700656</td>\n",
" <td>...</td>\n",
" <td>0.709216</td>\n",
" <td>2.193620</td>\n",
" <td>1.902530</td>\n",
" <td>0.931155</td>\n",
" <td>6.259504</td>\n",
" <td>0.511885</td>\n",
" <td>0.000000</td>\n",
" <td>3.877193</td>\n",
" <td>3.507173</td>\n",
" <td>1.869258</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95718</th>\n",
" <td>hg19::chr8:146016823..146016858,-;hg_95720.1</td>\n",
" <td>p4@RPL8</td>\n",
" <td>CAGE_peak_4_at_RPL8_5end</td>\n",
" <td>-443bp_to_BC000047_5end</td>\n",
" <td>6132</td>\n",
" <td>HGNC:10368</td>\n",
" <td>NaN</td>\n",
" <td>5.649300</td>\n",
" <td>4.855642</td>\n",
" <td>4.822601</td>\n",
" <td>...</td>\n",
" <td>0.405266</td>\n",
" <td>3.133742</td>\n",
" <td>5.707589</td>\n",
" <td>3.047416</td>\n",
" <td>2.086501</td>\n",
" <td>1.023770</td>\n",
" <td>2.047713</td>\n",
" <td>1.292398</td>\n",
" <td>5.041561</td>\n",
" <td>4.005552</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95719</th>\n",
" <td>hg19::chr8:146017737..146017748,-;hg_95721.1</td>\n",
" <td>p2@RPL8</td>\n",
" <td>CAGE_peak_2_at_RPL8_5end</td>\n",
" <td>2bp_to_ENST00000528957.5,uc064rpz.1_5end</td>\n",
" <td>6132</td>\n",
" <td>HGNC:10368</td>\n",
" <td>P62917</td>\n",
" <td>28.246498</td>\n",
" <td>28.931535</td>\n",
" <td>27.971083</td>\n",
" <td>...</td>\n",
" <td>15.298796</td>\n",
" <td>8.461104</td>\n",
" <td>7.610119</td>\n",
" <td>9.904102</td>\n",
" <td>8.346005</td>\n",
" <td>1.706283</td>\n",
" <td>6.655068</td>\n",
" <td>1.938596</td>\n",
" <td>3.945570</td>\n",
" <td>10.681473</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 1836 columns</p>\n",
"</div>"
],
"text/plain": [
" 00Annotation short_description \\\n",
"95717 hg19::chr8:146016750..146016780,-;hg_95719.1 p3@RPL8 \n",
"95718 hg19::chr8:146016823..146016858,-;hg_95720.1 p4@RPL8 \n",
"95719 hg19::chr8:146017737..146017748,-;hg_95721.1 p2@RPL8 \n",
"\n",
" description association_with_transcript \\\n",
"95717 CAGE_peak_3_at_RPL8_5end 494bp_to_ENST00000534781.1,uc064rpt.1_5end \n",
"95718 CAGE_peak_4_at_RPL8_5end -443bp_to_BC000047_5end \n",
"95719 CAGE_peak_2_at_RPL8_5end 2bp_to_ENST00000528957.5,uc064rpz.1_5end \n",
"\n",
" entrezgene_id hgnc_id uniprot_id \\\n",
"95717 6132 HGNC:10368 NaN \n",
"95718 6132 HGNC:10368 NaN \n",
"95719 6132 HGNC:10368 P62917 \n",
"\n",
" tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode \\\n",
"95717 3.116855 \n",
"95718 5.649300 \n",
"95719 28.246498 \n",
"\n",
" tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode \\\n",
"95717 3.034776 \n",
"95718 4.855642 \n",
"95719 28.931535 \n",
"\n",
" tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode \\\n",
"95717 2.700656 \n",
"95718 4.822601 \n",
"95719 27.971083 \n",
"\n",
" ... \\\n",
"95717 ... \n",
"95718 ... \n",
"95719 ... \n",
"\n",
" tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode \\\n",
"95717 0.709216 \n",
"95718 0.405266 \n",
"95719 15.298796 \n",
"\n",
" tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode \\\n",
"95717 2.193620 \n",
"95718 3.133742 \n",
"95719 8.461104 \n",
"\n",
" tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode \\\n",
"95717 1.902530 \n",
"95718 5.707589 \n",
"95719 7.610119 \n",
"\n",
" tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode \\\n",
"95717 0.931155 \n",
"95718 3.047416 \n",
"95719 9.904102 \n",
"\n",
" tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode \\\n",
"95717 6.259504 \n",
"95718 2.086501 \n",
"95719 8.346005 \n",
"\n",
" tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode \\\n",
"95717 0.511885 \n",
"95718 1.023770 \n",
"95719 1.706283 \n",
"\n",
" tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode \\\n",
"95717 0.000000 \n",
"95718 2.047713 \n",
"95719 6.655068 \n",
"\n",
" tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode \\\n",
"95717 3.877193 \n",
"95718 1.292398 \n",
"95719 1.938596 \n",
"\n",
" tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode \\\n",
"95717 3.507173 \n",
"95718 5.041561 \n",
"95719 3.945570 \n",
"\n",
" tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode \n",
"95717 1.869258 \n",
"95718 4.005552 \n",
"95719 10.681473 \n",
"\n",
"[3 rows x 1836 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the rows at positions 95717 through 95719 (end of the slice is exclusive).\n",
"fantom_data.iloc[95717:95720]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}