LucaCappelletti94/crr_labels

View on GitHub
Debugging FANTOM5 HG38 labels.ipynb

Summary

Maintainability
Test Coverage
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from crr_labels import fantom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "enhancers, promoters = fantom(\n",
    "    cell_lines=[\"GM12878\"], # list of cell lines to be considered.\n",
    "    window_size=256, # window size to use for the various regions.\n",
    "    genome = \"hg38\", # considered genome version, either \"hg19\" or \"hg38\".\n",
    "    center_enhancers = \"peak\", # how to center the enhancer window, either around \"peak\" or the \"center\" of the region.\n",
    "    enhancers_threshold = 0, # activation threshold for the enhancers.\n",
    "    promoters_threshold = 5, # activation threshold for the promoters.\n",
    "    drop_always_inactive_rows = False, # whether to drop the rows where no activation is detected in any of the considered cell lines.\n",
    "    binarize = True, # whether to return the data binary-encoded, zero for inactive, one for active.\n",
    "    nrows = None # the number of rows to read, useful when testing pipelines for creating smaller datasets.\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "promoters[~promoters.lifted].to_csv(\"promoters_hg38.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>chrom</th>\n",
       "      <th>chromStart</th>\n",
       "      <th>chromEnd</th>\n",
       "      <th>strand</th>\n",
       "      <th>GM12878</th>\n",
       "      <th>lifted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>chr1</td>\n",
       "      <td>564344</td>\n",
       "      <td>564600</td>\n",
       "      <td>+</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>chr1</td>\n",
       "      <td>564393</td>\n",
       "      <td>564649</td>\n",
       "      <td>+</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>chr1</td>\n",
       "      <td>565022</td>\n",
       "      <td>565278</td>\n",
       "      <td>+</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chr1</td>\n",
       "      <td>565227</td>\n",
       "      <td>565483</td>\n",
       "      <td>+</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>chr1</td>\n",
       "      <td>565285</td>\n",
       "      <td>565541</td>\n",
       "      <td>+</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96717</th>\n",
       "      <td>chrY</td>\n",
       "      <td>21906594</td>\n",
       "      <td>21906850</td>\n",
       "      <td>-</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96718</th>\n",
       "      <td>chrY</td>\n",
       "      <td>21906623</td>\n",
       "      <td>21906879</td>\n",
       "      <td>-</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96719</th>\n",
       "      <td>chrY</td>\n",
       "      <td>21906761</td>\n",
       "      <td>21907017</td>\n",
       "      <td>-</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96720</th>\n",
       "      <td>chrY</td>\n",
       "      <td>23613727</td>\n",
       "      <td>23613983</td>\n",
       "      <td>-</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96721</th>\n",
       "      <td>chrX</td>\n",
       "      <td>52386980</td>\n",
       "      <td>52387236</td>\n",
       "      <td>-</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>96722 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      chrom  chromStart  chromEnd strand  GM12878  lifted\n",
       "0      chr1      564344    564600      +        0    True\n",
       "1      chr1      564393    564649      +        0    True\n",
       "2      chr1      565022    565278      +        0    True\n",
       "3      chr1      565227    565483      +        0    True\n",
       "4      chr1      565285    565541      +        0    True\n",
       "...     ...         ...       ...    ...      ...     ...\n",
       "96717  chrY    21906594  21906850      -        0    True\n",
       "96718  chrY    21906623  21906879      -        0    True\n",
       "96719  chrY    21906761  21907017      -        0    True\n",
       "96720  chrY    23613727  23613983      -        0    True\n",
       "96721  chrX    52386980  52387236      -        0    True\n",
       "\n",
       "[96722 rows x 6 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "promoters[promoters.lifted]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "##ColumnVariables[00Annotation]=CAGE peak id\r\n",
      "##ColumnVariables[short_description]=short form of the description below. Common descriptions in the long descriptions has been omited\r\n",
      "##ColumnVariables[description]=description of the CAGE peak\r\n",
      "##ColumnVariables[association_with_transcript]=transcript which 5end is the nearest to the the CAGE peak\r\n",
      "##ColumnVariables[entrezgene_id]=entrezgene (genes) id associated with the transcript\r\n",
      "##ColumnVariables[hgnc_id]=hgnc (genes) id associated with the transcript\r\n",
      "##ColumnVariables[uniprot_id]=uniprot (protein) id associated with the transcript\r\n",
      "##ParemeterValue[genome_assembly]=hg38\r\n",
      "##ColumnVariables[tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode]=TPM (tags per million) of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode\r\n",
      "##ColumnVariables[tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode]=TPM (tags per million) of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode\r\n"
     ]
    }
   ],
   "source": [
    "!head \"fantom_data/hg38_fair+new_CAGE_peaks_phase1and2_tpm_ann.osc.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "fantom_data = pd.read_csv(\n",
    "    \"fantom_data/hg38_fair+new_CAGE_peaks_phase1and2_tpm_ann.osc.txt\",\n",
    "    sep=\"\\t\",\n",
    "    comment=\"#\",\n",
    "    low_memory=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "enhancers_data = pd.read_csv(\n",
    "    \"fantom_data/F5.hg38.enhancers.expression.tpm.matrix.gz\",\n",
    "    sep=\"\\t\",\n",
    "    low_memory=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>CNhs11844</th>\n",
       "      <th>CNhs11251</th>\n",
       "      <th>CNhs11282</th>\n",
       "      <th>CNhs10746</th>\n",
       "      <th>CNhs11253</th>\n",
       "      <th>CNhs13053</th>\n",
       "      <th>CNhs13054</th>\n",
       "      <th>CNhs13502</th>\n",
       "      <th>CNhs13052</th>\n",
       "      <th>...</th>\n",
       "      <th>CNhs10654</th>\n",
       "      <th>CNhs10635</th>\n",
       "      <th>CNhs11766</th>\n",
       "      <th>CNhs11765</th>\n",
       "      <th>CNhs10612</th>\n",
       "      <th>CNhs13464</th>\n",
       "      <th>CNhs11676</th>\n",
       "      <th>CNhs11763</th>\n",
       "      <th>CNhs12854</th>\n",
       "      <th>CNhs12844</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>chr10:100006233-100006603</td>\n",
       "      <td>1.168411</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.207444</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.149924</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>chr10:100008181-100008444</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.075973</td>\n",
       "      <td>0.201635</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>chr10:100014348-100014634</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.192173</td>\n",
       "      <td>0.097232</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.080305</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chr10:100020065-100020562</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.442423</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.604904</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>chr10:100043485-100043744</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.234773</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.075973</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63280</th>\n",
       "      <td>chrY:7520195-7520556</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.149924</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63281</th>\n",
       "      <td>chrY:7724230-7724512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63282</th>\n",
       "      <td>chrY:7769899-7770218</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.455839</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63283</th>\n",
       "      <td>chrY:7796227-7796534</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63284</th>\n",
       "      <td>chrY:8007529-8007781</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>63285 rows × 1830 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                      Unnamed: 0  CNhs11844  CNhs11251  CNhs11282  CNhs10746  \\\n",
       "0      chr10:100006233-100006603   1.168411        0.0   0.000000   0.000000   \n",
       "1      chr10:100008181-100008444   0.000000        0.0   0.000000   0.000000   \n",
       "2      chr10:100014348-100014634   0.000000        0.0   0.192173   0.097232   \n",
       "3      chr10:100020065-100020562   0.000000        0.0   0.000000   0.000000   \n",
       "4      chr10:100043485-100043744   0.000000        0.0   0.000000   0.000000   \n",
       "...                          ...        ...        ...        ...        ...   \n",
       "63280       chrY:7520195-7520556   0.000000        0.0   0.000000   0.000000   \n",
       "63281       chrY:7724230-7724512   0.000000        0.0   0.000000   0.000000   \n",
       "63282       chrY:7769899-7770218   0.000000        0.0   0.000000   0.000000   \n",
       "63283       chrY:7796227-7796534   0.000000        0.0   0.000000   0.000000   \n",
       "63284       chrY:8007529-8007781   0.000000        0.0   0.000000   0.000000   \n",
       "\n",
       "       CNhs11253  CNhs13053  CNhs13054  CNhs13502  CNhs13052  ...  CNhs10654  \\\n",
       "0       0.207444        0.0   0.000000        0.0        0.0  ...   0.000000   \n",
       "1       0.000000        0.0   0.000000        0.0        0.0  ...   0.000000   \n",
       "2       0.000000        0.0   0.000000        0.0        0.0  ...   0.080305   \n",
       "3       0.000000        0.0   0.000000        0.0        0.0  ...   0.000000   \n",
       "4       0.000000        0.0   0.234773        0.0        0.0  ...   0.000000   \n",
       "...          ...        ...        ...        ...        ...  ...        ...   \n",
       "63280   0.000000        0.0   0.000000        0.0        0.0  ...   0.000000   \n",
       "63281   0.000000        0.0   0.000000        0.0        0.0  ...   0.000000   \n",
       "63282   0.000000        0.0   0.000000        0.0        0.0  ...   0.000000   \n",
       "63283   0.000000        0.0   0.000000        0.0        0.0  ...   0.000000   \n",
       "63284   0.000000        0.0   0.000000        0.0        0.0  ...   0.000000   \n",
       "\n",
       "       CNhs10635  CNhs11766  CNhs11765  CNhs10612  CNhs13464  CNhs11676  \\\n",
       "0       0.149924        0.0   0.000000   0.000000   0.000000        0.0   \n",
       "1       0.000000        0.0   0.000000   0.075973   0.201635        0.0   \n",
       "2       0.000000        0.0   0.000000   0.000000   0.000000        0.0   \n",
       "3       0.000000        0.0   1.442423   0.000000   0.604904        0.0   \n",
       "4       0.000000        0.0   0.000000   0.075973   0.000000        0.0   \n",
       "...          ...        ...        ...        ...        ...        ...   \n",
       "63280   0.149924        0.0   0.000000   0.000000   0.000000        0.0   \n",
       "63281   0.000000        0.0   0.000000   0.000000   0.000000        0.0   \n",
       "63282   0.000000        0.0   0.000000   0.455839   0.000000        0.0   \n",
       "63283   0.000000        0.0   0.000000   0.000000   0.000000        0.0   \n",
       "63284   0.000000        0.0   0.000000   0.000000   0.000000        0.0   \n",
       "\n",
       "       CNhs11763  CNhs12854  CNhs12844  \n",
       "0            0.0        0.0        0.0  \n",
       "1            0.0        0.0        0.0  \n",
       "2            0.0        0.0        0.0  \n",
       "3            0.0        0.0        0.0  \n",
       "4            0.0        0.0        0.0  \n",
       "...          ...        ...        ...  \n",
       "63280        0.0        0.0        0.0  \n",
       "63281        0.0        0.0        0.0  \n",
       "63282        0.0        0.0        0.0  \n",
       "63283        0.0        0.0        0.0  \n",
       "63284        0.0        0.0        0.0  \n",
       "\n",
       "[63285 rows x 1830 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enhancers_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>00Annotation</th>\n",
       "      <th>short_description</th>\n",
       "      <th>description</th>\n",
       "      <th>association_with_transcript</th>\n",
       "      <th>entrezgene_id</th>\n",
       "      <th>hgnc_id</th>\n",
       "      <th>uniprot_id</th>\n",
       "      <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode</th>\n",
       "      <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode</th>\n",
       "      <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode</th>\n",
       "      <th>...</th>\n",
       "      <th>tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode</th>\n",
       "      <th>tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode</th>\n",
       "      <th>tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode</th>\n",
       "      <th>tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode</th>\n",
       "      <th>tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode</th>\n",
       "      <th>tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode</th>\n",
       "      <th>tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode</th>\n",
       "      <th>tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode</th>\n",
       "      <th>tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode</th>\n",
       "      <th>tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>95719</th>\n",
       "      <td>hg19::chr8:146017737..146017748,-;hg_95721.1</td>\n",
       "      <td>p2@RPL8</td>\n",
       "      <td>CAGE_peak_2_at_RPL8_5end</td>\n",
       "      <td>2bp_to_ENST00000528957.5,uc064rpz.1_5end</td>\n",
       "      <td>6132</td>\n",
       "      <td>HGNC:10368</td>\n",
       "      <td>P62917</td>\n",
       "      <td>28.246498</td>\n",
       "      <td>28.931535</td>\n",
       "      <td>27.971083</td>\n",
       "      <td>...</td>\n",
       "      <td>15.298796</td>\n",
       "      <td>8.461104</td>\n",
       "      <td>7.610119</td>\n",
       "      <td>9.904102</td>\n",
       "      <td>8.346005</td>\n",
       "      <td>1.706283</td>\n",
       "      <td>6.655068</td>\n",
       "      <td>1.938596</td>\n",
       "      <td>3.94557</td>\n",
       "      <td>10.681473</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 1836 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       00Annotation short_description  \\\n",
       "95719  hg19::chr8:146017737..146017748,-;hg_95721.1           p2@RPL8   \n",
       "\n",
       "                    description               association_with_transcript  \\\n",
       "95719  CAGE_peak_2_at_RPL8_5end  2bp_to_ENST00000528957.5,uc064rpz.1_5end   \n",
       "\n",
       "      entrezgene_id     hgnc_id uniprot_id  \\\n",
       "95719          6132  HGNC:10368     P62917   \n",
       "\n",
       "       tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode  \\\n",
       "95719                                          28.246498                                                    \n",
       "\n",
       "       tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode  \\\n",
       "95719                                          28.931535                                                    \n",
       "\n",
       "       tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode  \\\n",
       "95719                                          27.971083                                                    \n",
       "\n",
       "       ...  \\\n",
       "95719  ...   \n",
       "\n",
       "       tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode  \\\n",
       "95719                                          15.298796                                             \n",
       "\n",
       "       tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode  \\\n",
       "95719                                           8.461104                                              \n",
       "\n",
       "       tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode  \\\n",
       "95719                                           7.610119                                    \n",
       "\n",
       "       tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode  \\\n",
       "95719                                           9.904102                                          \n",
       "\n",
       "       tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode  \\\n",
       "95719                                           8.346005                                  \n",
       "\n",
       "       tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode  \\\n",
       "95719                                           1.706283                       \n",
       "\n",
       "       tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode  \\\n",
       "95719                                           6.655068                        \n",
       "\n",
       "       tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode  \\\n",
       "95719                                           1.938596            \n",
       "\n",
       "       tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode  \\\n",
       "95719                                            3.94557          \n",
       "\n",
       "       tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode  \n",
       "95719                                          10.681473                                             \n",
       "\n",
       "[1 rows x 1836 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fantom_data[fantom_data[\"00Annotation\"].str.contains(\"146017737\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>00Annotation</th>\n",
       "      <th>short_description</th>\n",
       "      <th>description</th>\n",
       "      <th>association_with_transcript</th>\n",
       "      <th>entrezgene_id</th>\n",
       "      <th>hgnc_id</th>\n",
       "      <th>uniprot_id</th>\n",
       "      <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode</th>\n",
       "      <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode</th>\n",
       "      <th>tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode</th>\n",
       "      <th>...</th>\n",
       "      <th>tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode</th>\n",
       "      <th>tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode</th>\n",
       "      <th>tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode</th>\n",
       "      <th>tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode</th>\n",
       "      <th>tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode</th>\n",
       "      <th>tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode</th>\n",
       "      <th>tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode</th>\n",
       "      <th>tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode</th>\n",
       "      <th>tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode</th>\n",
       "      <th>tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>95717</th>\n",
       "      <td>hg19::chr8:146016750..146016780,-;hg_95719.1</td>\n",
       "      <td>p3@RPL8</td>\n",
       "      <td>CAGE_peak_3_at_RPL8_5end</td>\n",
       "      <td>494bp_to_ENST00000534781.1,uc064rpt.1_5end</td>\n",
       "      <td>6132</td>\n",
       "      <td>HGNC:10368</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.116855</td>\n",
       "      <td>3.034776</td>\n",
       "      <td>2.700656</td>\n",
       "      <td>...</td>\n",
       "      <td>0.709216</td>\n",
       "      <td>2.193620</td>\n",
       "      <td>1.902530</td>\n",
       "      <td>0.931155</td>\n",
       "      <td>6.259504</td>\n",
       "      <td>0.511885</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.877193</td>\n",
       "      <td>3.507173</td>\n",
       "      <td>1.869258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95718</th>\n",
       "      <td>hg19::chr8:146016823..146016858,-;hg_95720.1</td>\n",
       "      <td>p4@RPL8</td>\n",
       "      <td>CAGE_peak_4_at_RPL8_5end</td>\n",
       "      <td>-443bp_to_BC000047_5end</td>\n",
       "      <td>6132</td>\n",
       "      <td>HGNC:10368</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.649300</td>\n",
       "      <td>4.855642</td>\n",
       "      <td>4.822601</td>\n",
       "      <td>...</td>\n",
       "      <td>0.405266</td>\n",
       "      <td>3.133742</td>\n",
       "      <td>5.707589</td>\n",
       "      <td>3.047416</td>\n",
       "      <td>2.086501</td>\n",
       "      <td>1.023770</td>\n",
       "      <td>2.047713</td>\n",
       "      <td>1.292398</td>\n",
       "      <td>5.041561</td>\n",
       "      <td>4.005552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95719</th>\n",
       "      <td>hg19::chr8:146017737..146017748,-;hg_95721.1</td>\n",
       "      <td>p2@RPL8</td>\n",
       "      <td>CAGE_peak_2_at_RPL8_5end</td>\n",
       "      <td>2bp_to_ENST00000528957.5,uc064rpz.1_5end</td>\n",
       "      <td>6132</td>\n",
       "      <td>HGNC:10368</td>\n",
       "      <td>P62917</td>\n",
       "      <td>28.246498</td>\n",
       "      <td>28.931535</td>\n",
       "      <td>27.971083</td>\n",
       "      <td>...</td>\n",
       "      <td>15.298796</td>\n",
       "      <td>8.461104</td>\n",
       "      <td>7.610119</td>\n",
       "      <td>9.904102</td>\n",
       "      <td>8.346005</td>\n",
       "      <td>1.706283</td>\n",
       "      <td>6.655068</td>\n",
       "      <td>1.938596</td>\n",
       "      <td>3.945570</td>\n",
       "      <td>10.681473</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 1836 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       00Annotation short_description  \\\n",
       "95717  hg19::chr8:146016750..146016780,-;hg_95719.1           p3@RPL8   \n",
       "95718  hg19::chr8:146016823..146016858,-;hg_95720.1           p4@RPL8   \n",
       "95719  hg19::chr8:146017737..146017748,-;hg_95721.1           p2@RPL8   \n",
       "\n",
       "                    description                 association_with_transcript  \\\n",
       "95717  CAGE_peak_3_at_RPL8_5end  494bp_to_ENST00000534781.1,uc064rpt.1_5end   \n",
       "95718  CAGE_peak_4_at_RPL8_5end                     -443bp_to_BC000047_5end   \n",
       "95719  CAGE_peak_2_at_RPL8_5end    2bp_to_ENST00000528957.5,uc064rpz.1_5end   \n",
       "\n",
       "      entrezgene_id     hgnc_id uniprot_id  \\\n",
       "95717          6132  HGNC:10368        NaN   \n",
       "95718          6132  HGNC:10368        NaN   \n",
       "95719          6132  HGNC:10368     P62917   \n",
       "\n",
       "       tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode  \\\n",
       "95717                                           3.116855                                                    \n",
       "95718                                           5.649300                                                    \n",
       "95719                                          28.246498                                                    \n",
       "\n",
       "       tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode  \\\n",
       "95717                                           3.034776                                                    \n",
       "95718                                           4.855642                                                    \n",
       "95719                                          28.931535                                                    \n",
       "\n",
       "       tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode  \\\n",
       "95717                                           2.700656                                                    \n",
       "95718                                           4.822601                                                    \n",
       "95719                                          27.971083                                                    \n",
       "\n",
       "       ...  \\\n",
       "95717  ...   \n",
       "95718  ...   \n",
       "95719  ...   \n",
       "\n",
       "       tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode  \\\n",
       "95717                                           0.709216                                             \n",
       "95718                                           0.405266                                             \n",
       "95719                                          15.298796                                             \n",
       "\n",
       "       tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode  \\\n",
       "95717                                           2.193620                                              \n",
       "95718                                           3.133742                                              \n",
       "95719                                           8.461104                                              \n",
       "\n",
       "       tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode  \\\n",
       "95717                                           1.902530                                    \n",
       "95718                                           5.707589                                    \n",
       "95719                                           7.610119                                    \n",
       "\n",
       "       tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode  \\\n",
       "95717                                           0.931155                                          \n",
       "95718                                           3.047416                                          \n",
       "95719                                           9.904102                                          \n",
       "\n",
       "       tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode  \\\n",
       "95717                                           6.259504                                  \n",
       "95718                                           2.086501                                  \n",
       "95719                                           8.346005                                  \n",
       "\n",
       "       tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode  \\\n",
       "95717                                           0.511885                       \n",
       "95718                                           1.023770                       \n",
       "95719                                           1.706283                       \n",
       "\n",
       "       tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode  \\\n",
       "95717                                           0.000000                        \n",
       "95718                                           2.047713                        \n",
       "95719                                           6.655068                        \n",
       "\n",
       "       tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode  \\\n",
       "95717                                           3.877193            \n",
       "95718                                           1.292398            \n",
       "95719                                           1.938596            \n",
       "\n",
       "       tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode  \\\n",
       "95717                                           3.507173          \n",
       "95718                                           5.041561          \n",
       "95719                                           3.945570          \n",
       "\n",
       "       tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode  \n",
       "95717                                           1.869258                                             \n",
       "95718                                           4.005552                                             \n",
       "95719                                          10.681473                                             \n",
       "\n",
       "[3 rows x 1836 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fantom_data[95717: 95720]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}