Retrieving HG38 files metadata.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Retrieving HG38 epigenomic files\n",
"The following notebook shows how the epigenomic files metadata are retrieved."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from glob import glob\n",
"import pandas as pd\n",
"import compress_json\n",
"from encodeproject import biosamples, accessions, biosample, download_urls"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We specify that we are only interested in the [GRCh38](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/) assembly, are currently in status [released](https://www.encodeproject.org/help/getting-started/status-terms/#FileStatuses), have replication type [isogenic](https://www.encodeproject.org/data-standards/terms/) (there is a biological replication) and the file format is [bigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html#:~:text=The%20bigWig%20format%20is%20useful,in%20an%20indexed%20binary%20format.&text=Wiggle%20data%20must%20be%20continuous%20and%20consist%20of%20equally%20sized%20elements.)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"parameters = dict(\n",
" assembly=\"GRCh38\",\n",
" replication_type=\"isogenic\",\n",
" file_format=\"bigWig\",\n",
" status=\"released\",\n",
" use_multiprocessing=False\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will append all the dataset while we obtain them to the following list."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"all_datasets = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieving CHIP-seq"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Retrieving biosamples: 0%| | 0/1836 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>TCFL5</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF522WAN</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>ZNF75A</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF651OZY</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>ZBTB17</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF334KTM</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>TFE3</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF808TLR</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>ZNF586</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF977XAH</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2377</th>\n",
" <td>human</td>\n",
" <td>ELF1</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF036DKA</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/10...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2378</th>\n",
" <td>human</td>\n",
" <td>GTF2F1</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF273OEJ</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/10...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2379</th>\n",
" <td>human</td>\n",
" <td>ATF7</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF204NFC</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/10...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2380</th>\n",
" <td>human</td>\n",
" <td>CHD1</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF204LFY</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 3]</td>\n",
" <td>[1_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/10...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2381</th>\n",
" <td>human</td>\n",
" <td>SPI1</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF289XSX</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2, 3]</td>\n",
" <td>[2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2019/02...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2382 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line \\\n",
"0 human TCFL5 EFO:0002067 K562 \n",
"1 human ZNF75A EFO:0002067 K562 \n",
"2 human ZBTB17 EFO:0002067 K562 \n",
"3 human TFE3 EFO:0002067 K562 \n",
"4 human ZNF586 EFO:0002067 K562 \n",
"... ... ... ... ... \n",
"2377 human ELF1 EFO:0002067 K562 \n",
"2378 human GTF2F1 EFO:0001203 MCF-7 \n",
"2379 human ATF7 EFO:0002067 K562 \n",
"2380 human CHD1 EFO:0001203 MCF-7 \n",
"2381 human SPI1 EFO:0002784 GM12878 \n",
"\n",
" institute_name title \\\n",
"0 Stanford University Michael Snyder, Stanford \n",
"1 Stanford University Michael Snyder, Stanford \n",
"2 Stanford University Michael Snyder, Stanford \n",
"3 Stanford University Michael Snyder, Stanford \n",
"4 Stanford University Michael Snyder, Stanford \n",
"... ... ... \n",
"2377 Stanford University Michael Snyder, Stanford \n",
"2378 Stanford University Michael Snyder, Stanford \n",
"2379 Stanford University Michael Snyder, Stanford \n",
"2380 Stanford University Michael Snyder, Stanford \n",
"2381 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB \n",
"\n",
" accession status assay_title assay_term_name ... output_category \\\n",
"0 ENCFF522WAN released TF ChIP-seq ChIP-seq ... signal \n",
"1 ENCFF651OZY released TF ChIP-seq ChIP-seq ... signal \n",
"2 ENCFF334KTM released TF ChIP-seq ChIP-seq ... signal \n",
"3 ENCFF808TLR released TF ChIP-seq ChIP-seq ... signal \n",
"4 ENCFF977XAH released TF ChIP-seq ChIP-seq ... signal \n",
"... ... ... ... ... ... ... \n",
"2377 ENCFF036DKA released TF ChIP-seq ChIP-seq ... signal \n",
"2378 ENCFF273OEJ released TF ChIP-seq ChIP-seq ... signal \n",
"2379 ENCFF204NFC released TF ChIP-seq ChIP-seq ... signal \n",
"2380 ENCFF204LFY released TF ChIP-seq ChIP-seq ... signal \n",
"2381 ENCFF289XSX released TF ChIP-seq ChIP-seq ... signal \n",
"\n",
" output_type read_length read_length_units run_type \\\n",
"0 fold change over control NaN None None \n",
"1 fold change over control NaN None None \n",
"2 fold change over control NaN None None \n",
"3 fold change over control NaN None None \n",
"4 fold change over control NaN None None \n",
"... ... ... ... ... \n",
"2377 fold change over control NaN None None \n",
"2378 fold change over control NaN None None \n",
"2379 fold change over control NaN None None \n",
"2380 fold change over control NaN None None \n",
"2381 fold change over control NaN None None \n",
"\n",
" schema_version encode_version biological_replicates \\\n",
"0 26 4.0 [1, 2] \n",
"1 26 4.0 [1, 2] \n",
"2 26 4.0 [1, 2] \n",
"3 26 4.0 [1, 2] \n",
"4 26 4.0 [1, 2] \n",
"... ... ... ... \n",
"2377 26 3.0 [1, 2] \n",
"2378 26 3.0 [1, 2] \n",
"2379 26 3.0 [1, 2] \n",
"2380 26 3.0 [1, 3] \n",
"2381 26 3.0 [2, 3] \n",
"\n",
" technical_replicates url \n",
"0 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/11... \n",
"1 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/07... \n",
"2 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/07... \n",
"3 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/07... \n",
"4 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/07... \n",
"... ... ... \n",
"2377 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2016/10... \n",
"2378 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2016/10... \n",
"2379 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2016/10... \n",
"2380 [1_1, 3_1] https://encode-public.s3.amazonaws.com/2016/10... \n",
"2381 [2_1, 3_1] https://encode-public.s3.amazonaws.com/2019/02... \n",
"\n",
"[2382 rows x 26 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples = biosamples(\n",
" accessions=accessions(compress_json.load(\"hg38_encode_queries/chipseq.json\")),\n",
" min_biological_replicates=2,\n",
" output_type=\"fold change over control\",\n",
" **parameters\n",
")\n",
"all_datasets.append(samples)\n",
"samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieving DNASE-seq"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Retrieving biosamples: 0%| | 0/5 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF113VII</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF577SOF</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/08...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF842XRQ</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/08...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF546MZK</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF413AHU</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1, 1_2]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2018/05...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF936BDN</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2018/05...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF414OGC</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1, 1_2]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF338LXW</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF924FJR</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/08...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF949ANK</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/08...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF631QXL</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF134COA</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF280JCA</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF688CJL</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_7]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF985FHV</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_7]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF541BTE</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF915DFR</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/08...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF743ULW</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/08...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF960FMM</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF428XFI</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line institute_name \\\n",
"0 human Unknown EFO:0001187 HepG2 University of Washington \n",
"1 human Unknown EFO:0001187 HepG2 University of Washington \n",
"2 human Unknown EFO:0001187 HepG2 University of Washington \n",
"3 human Unknown EFO:0001187 HepG2 University of Washington \n",
"4 human Unknown EFO:0002067 K562 University of Washington \n",
"5 human Unknown EFO:0002067 K562 University of Washington \n",
"6 human Unknown EFO:0002067 K562 University of Washington \n",
"7 human Unknown EFO:0002067 K562 University of Washington \n",
"8 human Unknown EFO:0001203 MCF-7 University of Washington \n",
"9 human Unknown EFO:0001203 MCF-7 University of Washington \n",
"10 human Unknown EFO:0001203 MCF-7 University of Washington \n",
"11 human Unknown EFO:0001203 MCF-7 University of Washington \n",
"12 human Unknown EFO:0001086 A549 University of Washington \n",
"13 human Unknown EFO:0001086 A549 University of Washington \n",
"14 human Unknown EFO:0001086 A549 University of Washington \n",
"15 human Unknown EFO:0001086 A549 University of Washington \n",
"16 human Unknown EFO:0002784 GM12878 University of Washington \n",
"17 human Unknown EFO:0002784 GM12878 University of Washington \n",
"18 human Unknown EFO:0002784 GM12878 University of Washington \n",
"19 human Unknown EFO:0002784 GM12878 University of Washington \n",
"\n",
" title accession status assay_title \\\n",
"0 John Stamatoyannopoulos, UW ENCFF113VII released DNase-seq \n",
"1 John Stamatoyannopoulos, UW ENCFF577SOF released DNase-seq \n",
"2 John Stamatoyannopoulos, UW ENCFF842XRQ released DNase-seq \n",
"3 John Stamatoyannopoulos, UW ENCFF546MZK released DNase-seq \n",
"4 John Stamatoyannopoulos, UW ENCFF413AHU released DNase-seq \n",
"5 John Stamatoyannopoulos, UW ENCFF936BDN released DNase-seq \n",
"6 John Stamatoyannopoulos, UW ENCFF414OGC released DNase-seq \n",
"7 John Stamatoyannopoulos, UW ENCFF338LXW released DNase-seq \n",
"8 John Stamatoyannopoulos, UW ENCFF924FJR released DNase-seq \n",
"9 John Stamatoyannopoulos, UW ENCFF949ANK released DNase-seq \n",
"10 John Stamatoyannopoulos, UW ENCFF631QXL released DNase-seq \n",
"11 John Stamatoyannopoulos, UW ENCFF134COA released DNase-seq \n",
"12 John Stamatoyannopoulos, UW ENCFF280JCA released DNase-seq \n",
"13 John Stamatoyannopoulos, UW ENCFF688CJL released DNase-seq \n",
"14 John Stamatoyannopoulos, UW ENCFF985FHV released DNase-seq \n",
"15 John Stamatoyannopoulos, UW ENCFF541BTE released DNase-seq \n",
"16 John Stamatoyannopoulos, UW ENCFF915DFR released DNase-seq \n",
"17 John Stamatoyannopoulos, UW ENCFF743ULW released DNase-seq \n",
"18 John Stamatoyannopoulos, UW ENCFF960FMM released DNase-seq \n",
"19 John Stamatoyannopoulos, UW ENCFF428XFI released DNase-seq \n",
"\n",
" assay_term_name ... output_category output_type \\\n",
"0 DNase-seq ... signal read-depth normalized signal \n",
"1 DNase-seq ... signal read-depth normalized signal \n",
"2 DNase-seq ... signal read-depth normalized signal \n",
"3 DNase-seq ... signal read-depth normalized signal \n",
"4 DNase-seq ... signal read-depth normalized signal \n",
"5 DNase-seq ... signal read-depth normalized signal \n",
"6 DNase-seq ... signal read-depth normalized signal \n",
"7 DNase-seq ... signal read-depth normalized signal \n",
"8 DNase-seq ... signal read-depth normalized signal \n",
"9 DNase-seq ... signal read-depth normalized signal \n",
"10 DNase-seq ... signal read-depth normalized signal \n",
"11 DNase-seq ... signal read-depth normalized signal \n",
"12 DNase-seq ... signal read-depth normalized signal \n",
"13 DNase-seq ... signal read-depth normalized signal \n",
"14 DNase-seq ... signal read-depth normalized signal \n",
"15 DNase-seq ... signal read-depth normalized signal \n",
"16 DNase-seq ... signal read-depth normalized signal \n",
"17 DNase-seq ... signal read-depth normalized signal \n",
"18 DNase-seq ... signal read-depth normalized signal \n",
"19 DNase-seq ... signal read-depth normalized signal \n",
"\n",
" read_length read_length_units run_type schema_version encode_version \\\n",
"0 NaN None None 26 4.0 \n",
"1 NaN None None 26 3.0 \n",
"2 NaN None None 26 3.0 \n",
"3 NaN None None 26 4.0 \n",
"4 NaN None None 26 3.0 \n",
"5 NaN None None 26 3.0 \n",
"6 NaN None None 26 4.0 \n",
"7 NaN None None 26 4.0 \n",
"8 NaN None None 26 3.0 \n",
"9 NaN None None 26 3.0 \n",
"10 NaN None None 26 4.0 \n",
"11 NaN None None 26 4.0 \n",
"12 NaN None None 26 3.0 \n",
"13 NaN None None 26 3.0 \n",
"14 NaN None None 26 4.0 \n",
"15 NaN None None 26 4.0 \n",
"16 NaN None None 26 3.0 \n",
"17 NaN None None 26 3.0 \n",
"18 NaN None None 26 4.0 \n",
"19 NaN None None 26 4.0 \n",
"\n",
" biological_replicates technical_replicates \\\n",
"0 [2] [2_1] \n",
"1 [1] [1_1] \n",
"2 [2] [2_1] \n",
"3 [1] [1_1] \n",
"4 [1] [1_1, 1_2] \n",
"5 [2] [2_1] \n",
"6 [1] [1_1, 1_2] \n",
"7 [2] [2_1] \n",
"8 [1] [1_1] \n",
"9 [2] [2_1] \n",
"10 [1] [1_1] \n",
"11 [2] [2_1] \n",
"12 [2] [2_1] \n",
"13 [1] [1_7] \n",
"14 [1] [1_7] \n",
"15 [2] [2_1] \n",
"16 [1] [1_1] \n",
"17 [2] [2_1] \n",
"18 [2] [2_1] \n",
"19 [1] [1_1] \n",
"\n",
" url \n",
"0 https://encode-public.s3.amazonaws.com/2020/09... \n",
"1 https://encode-public.s3.amazonaws.com/2017/08... \n",
"2 https://encode-public.s3.amazonaws.com/2017/08... \n",
"3 https://encode-public.s3.amazonaws.com/2020/09... \n",
"4 https://encode-public.s3.amazonaws.com/2018/05... \n",
"5 https://encode-public.s3.amazonaws.com/2018/05... \n",
"6 https://encode-public.s3.amazonaws.com/2020/11... \n",
"7 https://encode-public.s3.amazonaws.com/2020/11... \n",
"8 https://encode-public.s3.amazonaws.com/2017/08... \n",
"9 https://encode-public.s3.amazonaws.com/2017/08... \n",
"10 https://encode-public.s3.amazonaws.com/2020/09... \n",
"11 https://encode-public.s3.amazonaws.com/2020/09... \n",
"12 https://encode-public.s3.amazonaws.com/2017/11... \n",
"13 https://encode-public.s3.amazonaws.com/2017/11... \n",
"14 https://encode-public.s3.amazonaws.com/2020/11... \n",
"15 https://encode-public.s3.amazonaws.com/2020/11... \n",
"16 https://encode-public.s3.amazonaws.com/2017/08... \n",
"17 https://encode-public.s3.amazonaws.com/2017/08... \n",
"18 https://encode-public.s3.amazonaws.com/2020/11... \n",
"19 https://encode-public.s3.amazonaws.com/2020/11... \n",
"\n",
"[20 rows x 26 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples = biosamples(\n",
" accessions=accessions(compress_json.load(\"hg38_encode_queries/dnaseseq.json\")),\n",
" organism=None,\n",
" **parameters\n",
")\n",
"samples[\"organism\"] = \"human\"\n",
"\n",
"all_datasets.append(samples)\n",
"samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieving WGBS"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Retrieving biosamples: 0%| | 0/5 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF400QTE</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF583VWF</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF527HXB</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF401QUB</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF796NFQ</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/03...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF812CHG</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/03...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF073DUG</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/03...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF716NKX</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/03...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF872YSC</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/03...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF669KCI</td>\n",
" <td>released</td>\n",
" <td>WGBS</td>\n",
" <td>whole-genome shotgun bisulfite sequencing</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/03...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line \\\n",
"0 human Unknown EFO:0001187 HepG2 \n",
"1 human Unknown EFO:0001187 HepG2 \n",
"2 human Unknown EFO:0001086 A549 \n",
"3 human Unknown EFO:0001086 A549 \n",
"4 human Unknown EFO:0002784 GM12878 \n",
"5 human Unknown EFO:0002784 GM12878 \n",
"6 human Unknown EFO:0001187 HepG2 \n",
"7 human Unknown EFO:0001187 HepG2 \n",
"8 human Unknown EFO:0002067 K562 \n",
"9 human Unknown EFO:0002067 K562 \n",
"\n",
" institute_name title accession \\\n",
"0 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF400QTE \n",
"1 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF583VWF \n",
"2 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF527HXB \n",
"3 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF401QUB \n",
"4 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF796NFQ \n",
"5 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF812CHG \n",
"6 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF073DUG \n",
"7 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF716NKX \n",
"8 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF872YSC \n",
"9 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB ENCFF669KCI \n",
"\n",
" status assay_title assay_term_name ... \\\n",
"0 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"1 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"2 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"3 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"4 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"5 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"6 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"7 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"8 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"9 released WGBS whole-genome shotgun bisulfite sequencing ... \n",
"\n",
" output_category output_type read_length read_length_units run_type \\\n",
"0 signal signal NaN None None \n",
"1 signal signal NaN None None \n",
"2 signal signal NaN None None \n",
"3 signal signal NaN None None \n",
"4 signal signal NaN None None \n",
"5 signal signal NaN None None \n",
"6 signal signal NaN None None \n",
"7 signal signal NaN None None \n",
"8 signal signal NaN None None \n",
"9 signal signal NaN None None \n",
"\n",
" schema_version encode_version biological_replicates technical_replicates \\\n",
"0 26 3 [1] [1_1] \n",
"1 26 3 [2] [2_1] \n",
"2 26 3 [1] [1_1] \n",
"3 26 3 [2] [2_1] \n",
"4 26 3 [1] [1_1] \n",
"5 26 3 [2] [2_1] \n",
"6 26 3 [1] [1_1] \n",
"7 26 3 [2] [2_1] \n",
"8 26 3 [1] [1_1] \n",
"9 26 3 [2] [2_1] \n",
"\n",
" url \n",
"0 https://encode-public.s3.amazonaws.com/2017/11... \n",
"1 https://encode-public.s3.amazonaws.com/2017/11... \n",
"2 https://encode-public.s3.amazonaws.com/2017/12... \n",
"3 https://encode-public.s3.amazonaws.com/2017/12... \n",
"4 https://encode-public.s3.amazonaws.com/2016/03... \n",
"5 https://encode-public.s3.amazonaws.com/2016/03... \n",
"6 https://encode-public.s3.amazonaws.com/2016/03... \n",
"7 https://encode-public.s3.amazonaws.com/2016/03... \n",
"8 https://encode-public.s3.amazonaws.com/2016/03... \n",
"9 https://encode-public.s3.amazonaws.com/2016/03... \n",
"\n",
"[10 rows x 26 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples = biosamples(\n",
" accessions=accessions(compress_json.load(\"hg38_encode_queries/wgbs.json\")),\n",
" organism=None,\n",
" **parameters,\n",
")\n",
"# I have manually checked that the version of the files is 3, but it is not available in the metadata.\n",
"samples[\"encode_version\"] = 3\n",
"samples[\"organism\"] = \"human\"\n",
"all_datasets.append(samples)\n",
"samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieving ATAC"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Retrieving biosamples: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF091DNE</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF927YYP</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF831OKQ</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF872HBM</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF948GBJ</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF450XLU</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF225UNA</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>7 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line institute_name \\\n",
"0 human Unknown EFO:0002784 GM12878 Stanford University \n",
"1 human Unknown EFO:0001203 MCF-7 Stanford University \n",
"2 human Unknown EFO:0002067 K562 Stanford University \n",
"3 human Unknown EFO:0001187 HepG2 Stanford University \n",
"4 human Unknown EFO:0002784 GM12878 Stanford University \n",
"5 human Unknown EFO:0002067 K562 Stanford University \n",
"6 human Unknown EFO:0001086 A549 Stanford University \n",
"\n",
" title accession status assay_title \\\n",
"0 Michael Snyder, Stanford ENCFF091DNE released ATAC-seq \n",
"1 Michael Snyder, Stanford ENCFF927YYP released ATAC-seq \n",
"2 Michael Snyder, Stanford ENCFF831OKQ released ATAC-seq \n",
"3 Michael Snyder, Stanford ENCFF872HBM released ATAC-seq \n",
"4 Michael Snyder, Stanford ENCFF948GBJ released ATAC-seq \n",
"5 Michael Snyder, Stanford ENCFF450XLU released ATAC-seq \n",
"6 Michael Snyder, Stanford ENCFF225UNA released ATAC-seq \n",
"\n",
" assay_term_name ... output_category output_type read_length \\\n",
"0 ATAC-seq ... signal fold change over control NaN \n",
"1 ATAC-seq ... signal fold change over control NaN \n",
"2 ATAC-seq ... signal fold change over control NaN \n",
"3 ATAC-seq ... signal fold change over control NaN \n",
"4 ATAC-seq ... signal fold change over control NaN \n",
"5 ATAC-seq ... signal fold change over control NaN \n",
"6 ATAC-seq ... signal fold change over control NaN \n",
"\n",
" read_length_units run_type schema_version encode_version \\\n",
"0 None None 26 4.0 \n",
"1 None None 26 4.0 \n",
"2 None None 26 4.0 \n",
"3 None None 26 4.0 \n",
"4 None None 26 4.0 \n",
"5 None None 26 4.0 \n",
"6 None None 26 4.0 \n",
"\n",
" biological_replicates technical_replicates \\\n",
"0 [1, 2] [1_1, 2_1] \n",
"1 [1, 2] [1_1, 2_1] \n",
"2 [1, 2] [1_1, 2_1] \n",
"3 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"4 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"5 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"6 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"\n",
" url \n",
"0 https://encode-public.s3.amazonaws.com/2020/07... \n",
"1 https://encode-public.s3.amazonaws.com/2020/07... \n",
"2 https://encode-public.s3.amazonaws.com/2020/07... \n",
"3 https://encode-public.s3.amazonaws.com/2020/07... \n",
"4 https://encode-public.s3.amazonaws.com/2020/07... \n",
"5 https://encode-public.s3.amazonaws.com/2020/07... \n",
"6 https://encode-public.s3.amazonaws.com/2020/07... \n",
"\n",
"[7 rows x 26 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples = biosamples(\n",
" accessions=accessions(compress_json.load(\"hg38_encode_queries/atacseq.json\")),\n",
" organism=None,\n",
" min_biological_replicates=2,\n",
" output_type=\"fold change over control\",\n",
" **parameters\n",
")\n",
"samples[\"organism\"] = \"human\"\n",
"\n",
"all_datasets.append(samples)\n",
"samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Combining all datasets"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>TCFL5</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF522WAN</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>ZNF75A</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF651OZY</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>ZBTB17</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF334KTM</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>TFE3</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF808TLR</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>ZNF586</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF977XAH</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF831OKQ</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF872HBM</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF948GBJ</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF450XLU</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF225UNA</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2419 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line institute_name \\\n",
"0 human TCFL5 EFO:0002067 K562 Stanford University \n",
"1 human ZNF75A EFO:0002067 K562 Stanford University \n",
"2 human ZBTB17 EFO:0002067 K562 Stanford University \n",
"3 human TFE3 EFO:0002067 K562 Stanford University \n",
"4 human ZNF586 EFO:0002067 K562 Stanford University \n",
".. ... ... ... ... ... \n",
"2 human Unknown EFO:0002067 K562 Stanford University \n",
"3 human Unknown EFO:0001187 HepG2 Stanford University \n",
"4 human Unknown EFO:0002784 GM12878 Stanford University \n",
"5 human Unknown EFO:0002067 K562 Stanford University \n",
"6 human Unknown EFO:0001086 A549 Stanford University \n",
"\n",
" title accession status assay_title \\\n",
"0 Michael Snyder, Stanford ENCFF522WAN released TF ChIP-seq \n",
"1 Michael Snyder, Stanford ENCFF651OZY released TF ChIP-seq \n",
"2 Michael Snyder, Stanford ENCFF334KTM released TF ChIP-seq \n",
"3 Michael Snyder, Stanford ENCFF808TLR released TF ChIP-seq \n",
"4 Michael Snyder, Stanford ENCFF977XAH released TF ChIP-seq \n",
".. ... ... ... ... \n",
"2 Michael Snyder, Stanford ENCFF831OKQ released ATAC-seq \n",
"3 Michael Snyder, Stanford ENCFF872HBM released ATAC-seq \n",
"4 Michael Snyder, Stanford ENCFF948GBJ released ATAC-seq \n",
"5 Michael Snyder, Stanford ENCFF450XLU released ATAC-seq \n",
"6 Michael Snyder, Stanford ENCFF225UNA released ATAC-seq \n",
"\n",
" assay_term_name ... output_category output_type read_length \\\n",
"0 ChIP-seq ... signal fold change over control NaN \n",
"1 ChIP-seq ... signal fold change over control NaN \n",
"2 ChIP-seq ... signal fold change over control NaN \n",
"3 ChIP-seq ... signal fold change over control NaN \n",
"4 ChIP-seq ... signal fold change over control NaN \n",
".. ... ... ... ... ... \n",
"2 ATAC-seq ... signal fold change over control NaN \n",
"3 ATAC-seq ... signal fold change over control NaN \n",
"4 ATAC-seq ... signal fold change over control NaN \n",
"5 ATAC-seq ... signal fold change over control NaN \n",
"6 ATAC-seq ... signal fold change over control NaN \n",
"\n",
" read_length_units run_type schema_version encode_version \\\n",
"0 None None 26 4.0 \n",
"1 None None 26 4.0 \n",
"2 None None 26 4.0 \n",
"3 None None 26 4.0 \n",
"4 None None 26 4.0 \n",
".. ... ... ... ... \n",
"2 None None 26 4.0 \n",
"3 None None 26 4.0 \n",
"4 None None 26 4.0 \n",
"5 None None 26 4.0 \n",
"6 None None 26 4.0 \n",
"\n",
" biological_replicates technical_replicates \\\n",
"0 [1, 2] [1_1, 2_1] \n",
"1 [1, 2] [1_1, 2_1] \n",
"2 [1, 2] [1_1, 2_1] \n",
"3 [1, 2] [1_1, 2_1] \n",
"4 [1, 2] [1_1, 2_1] \n",
".. ... ... \n",
"2 [1, 2] [1_1, 2_1] \n",
"3 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"4 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"5 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"6 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"\n",
" url \n",
"0 https://encode-public.s3.amazonaws.com/2020/11... \n",
"1 https://encode-public.s3.amazonaws.com/2020/07... \n",
"2 https://encode-public.s3.amazonaws.com/2020/07... \n",
"3 https://encode-public.s3.amazonaws.com/2020/07... \n",
"4 https://encode-public.s3.amazonaws.com/2020/07... \n",
".. ... \n",
"2 https://encode-public.s3.amazonaws.com/2020/07... \n",
"3 https://encode-public.s3.amazonaws.com/2020/07... \n",
"4 https://encode-public.s3.amazonaws.com/2020/07... \n",
"5 https://encode-public.s3.amazonaws.com/2020/07... \n",
"6 https://encode-public.s3.amazonaws.com/2020/07... \n",
"\n",
"[2419 rows x 26 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stack every retrieved dataset into a single frame.\n",
"# ignore_index=True gives each row a unique 0..n-1 label; without it every\n",
"# dataset keeps its own 0-based index and the labels repeat in the result.\n",
"combined = pd.concat(all_datasets, ignore_index=True)\n",
"combined"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>TCFL5</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF522WAN</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>ZNF75A</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF651OZY</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>ZBTB17</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF334KTM</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>TFE3</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF808TLR</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>ZNF586</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF977XAH</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF831OKQ</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001187</td>\n",
" <td>HepG2</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF872HBM</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF948GBJ</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF450XLU</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF225UNA</td>\n",
" <td>released</td>\n",
" <td>ATAC-seq</td>\n",
" <td>ATAC-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2, 3]</td>\n",
" <td>[1_1, 2_1, 3_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2419 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line institute_name \\\n",
"0 human TCFL5 EFO:0002067 K562 Stanford University \n",
"1 human ZNF75A EFO:0002067 K562 Stanford University \n",
"2 human ZBTB17 EFO:0002067 K562 Stanford University \n",
"3 human TFE3 EFO:0002067 K562 Stanford University \n",
"4 human ZNF586 EFO:0002067 K562 Stanford University \n",
".. ... ... ... ... ... \n",
"2 human Unknown EFO:0002067 K562 Stanford University \n",
"3 human Unknown EFO:0001187 HepG2 Stanford University \n",
"4 human Unknown EFO:0002784 GM12878 Stanford University \n",
"5 human Unknown EFO:0002067 K562 Stanford University \n",
"6 human Unknown EFO:0001086 A549 Stanford University \n",
"\n",
" title accession status assay_title \\\n",
"0 Michael Snyder, Stanford ENCFF522WAN released TF ChIP-seq \n",
"1 Michael Snyder, Stanford ENCFF651OZY released TF ChIP-seq \n",
"2 Michael Snyder, Stanford ENCFF334KTM released TF ChIP-seq \n",
"3 Michael Snyder, Stanford ENCFF808TLR released TF ChIP-seq \n",
"4 Michael Snyder, Stanford ENCFF977XAH released TF ChIP-seq \n",
".. ... ... ... ... \n",
"2 Michael Snyder, Stanford ENCFF831OKQ released ATAC-seq \n",
"3 Michael Snyder, Stanford ENCFF872HBM released ATAC-seq \n",
"4 Michael Snyder, Stanford ENCFF948GBJ released ATAC-seq \n",
"5 Michael Snyder, Stanford ENCFF450XLU released ATAC-seq \n",
"6 Michael Snyder, Stanford ENCFF225UNA released ATAC-seq \n",
"\n",
" assay_term_name ... output_category output_type read_length \\\n",
"0 ChIP-seq ... signal fold change over control NaN \n",
"1 ChIP-seq ... signal fold change over control NaN \n",
"2 ChIP-seq ... signal fold change over control NaN \n",
"3 ChIP-seq ... signal fold change over control NaN \n",
"4 ChIP-seq ... signal fold change over control NaN \n",
".. ... ... ... ... ... \n",
"2 ATAC-seq ... signal fold change over control NaN \n",
"3 ATAC-seq ... signal fold change over control NaN \n",
"4 ATAC-seq ... signal fold change over control NaN \n",
"5 ATAC-seq ... signal fold change over control NaN \n",
"6 ATAC-seq ... signal fold change over control NaN \n",
"\n",
" read_length_units run_type schema_version encode_version \\\n",
"0 None None 26 4.0 \n",
"1 None None 26 4.0 \n",
"2 None None 26 4.0 \n",
"3 None None 26 4.0 \n",
"4 None None 26 4.0 \n",
".. ... ... ... ... \n",
"2 None None 26 4.0 \n",
"3 None None 26 4.0 \n",
"4 None None 26 4.0 \n",
"5 None None 26 4.0 \n",
"6 None None 26 4.0 \n",
"\n",
" biological_replicates technical_replicates \\\n",
"0 [1, 2] [1_1, 2_1] \n",
"1 [1, 2] [1_1, 2_1] \n",
"2 [1, 2] [1_1, 2_1] \n",
"3 [1, 2] [1_1, 2_1] \n",
"4 [1, 2] [1_1, 2_1] \n",
".. ... ... \n",
"2 [1, 2] [1_1, 2_1] \n",
"3 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"4 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"5 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"6 [1, 2, 3] [1_1, 2_1, 3_1] \n",
"\n",
" url \n",
"0 https://encode-public.s3.amazonaws.com/2020/11... \n",
"1 https://encode-public.s3.amazonaws.com/2020/07... \n",
"2 https://encode-public.s3.amazonaws.com/2020/07... \n",
"3 https://encode-public.s3.amazonaws.com/2020/07... \n",
"4 https://encode-public.s3.amazonaws.com/2020/07... \n",
".. ... \n",
"2 https://encode-public.s3.amazonaws.com/2020/07... \n",
"3 https://encode-public.s3.amazonaws.com/2020/07... \n",
"4 https://encode-public.s3.amazonaws.com/2020/07... \n",
"5 https://encode-public.s3.amazonaws.com/2020/07... \n",
"6 https://encode-public.s3.amazonaws.com/2020/07... \n",
"\n",
"[2419 rows x 26 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined"
]
},
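{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Keeping only the latest ENCODE version of each file\n",
"Files that share the same target, cell line, assay title, institute and biological replicates are treated as versions of the same file: only the entry with the highest `encode_version` is kept, and the result is saved as a CSV file."
]
},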
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Columns that identify one experiment: rows sharing all of them are treated\n",
"# as versions of the same file.\n",
"group_keys = [\n",
"    \"target\",\n",
"    \"cell_line\",\n",
"    \"assay_title\",\n",
"    \"institute_name\",\n",
"    \"string_biological_replicates\",\n",
"]\n",
"\n",
"# The replicates are displayed as list-like values (presumably Python lists,\n",
"# which cannot be used as grouping keys), so their string form is the key.\n",
"# `assign` builds a new frame, so `combined` itself is not modified.\n",
"keyed = combined.assign(\n",
"    string_biological_replicates=combined[\"biological_replicates\"].astype(str)\n",
")\n",
"\n",
"# Keep the WHOLE row with the highest encode_version of every group.\n",
"# `groupby(...).last()` takes the last non-null value of each column on its\n",
"# own, so it can stitch fields of different files into one row.\n",
"latest = (\n",
"    keyed\n",
"    # groupby silently dropped rows with a missing key: keep that behaviour.\n",
"    .dropna(subset=group_keys)\n",
"    # Stable sort: ties keep the order in which the datasets were combined.\n",
"    # Rows without a version go first so they never win over a versioned row.\n",
"    .sort_values(\"encode_version\", kind=\"mergesort\", na_position=\"first\")\n",
"    .drop_duplicates(subset=group_keys, keep=\"last\")\n",
"    # Same row order as the grouped output: sorted by the grouping keys.\n",
"    .sort_values(group_keys, kind=\"mergesort\")\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# Same column layout as before: grouping keys first, then everything else.\n",
"other_columns = [column for column in latest.columns if column not in group_keys]\n",
"filtered_combined = latest[group_keys + other_columns].assign(\n",
"    # Point every URL to the public location instead of the private one.\n",
"    url=lambda frame: frame[\"url\"].str.replace(\"private\", \"public\", regex=False)\n",
")\n",
"\n",
"# Exactly one row must survive for each combination of grouping keys.\n",
"assert not filtered_combined.duplicated(subset=group_keys).any()\n",
"\n",
"filtered_combined.to_csv(\"epigenomic_dataset/epigenomes_metadata/hg38.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>target</th>\n",
" <th>cell_line</th>\n",
" <th>assay_title</th>\n",
" <th>institute_name</th>\n",
" <th>string_biological_replicates</th>\n",
" <th>organism</th>\n",
" <th>term_id</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ADNP</td>\n",
" <td>HepG2</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001187</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF290ZNR</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ADNP</td>\n",
" <td>K562</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>University of Chicago</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0002067</td>\n",
" <td>Kevin White, UChicago</td>\n",
" <td>ENCFF340UTP</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/10...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AEBP2</td>\n",
" <td>HEK293</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001182</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF627OLR</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/08...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AFF1</td>\n",
" <td>K562</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0002067</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF003ZRP</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AFF4</td>\n",
" <td>HepG2</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001187</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF274ASN</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1724</th>\n",
" <td>ZXDB</td>\n",
" <td>HEK293</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001182</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF741ADU</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/08...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1725</th>\n",
" <td>ZXDC</td>\n",
" <td>HepG2</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001187</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF805VXG</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/06...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1726</th>\n",
" <td>ZZZ3</td>\n",
" <td>GM12878</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Harvard Medical School</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0002784</td>\n",
" <td>Kevin Struhl, HMS</td>\n",
" <td>ENCFF636KMF</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1727</th>\n",
" <td>ZZZ3</td>\n",
" <td>HepG2</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001187</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF599UAF</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1728</th>\n",
" <td>ZZZ3</td>\n",
" <td>K562</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0002067</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF481FFX</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>4.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2020/12...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1729 rows × 27 columns</p>\n",
"</div>"
],
"text/plain": [
" target cell_line assay_title institute_name \\\n",
"0 ADNP HepG2 TF ChIP-seq HudsonAlpha Institute for Biotechnology \n",
"1 ADNP K562 TF ChIP-seq University of Chicago \n",
"2 AEBP2 HEK293 TF ChIP-seq Stanford University \n",
"3 AFF1 K562 TF ChIP-seq Stanford University \n",
"4 AFF4 HepG2 TF ChIP-seq HudsonAlpha Institute for Biotechnology \n",
"... ... ... ... ... \n",
"1724 ZXDB HEK293 TF ChIP-seq Stanford University \n",
"1725 ZXDC HepG2 TF ChIP-seq HudsonAlpha Institute for Biotechnology \n",
"1726 ZZZ3 GM12878 TF ChIP-seq Harvard Medical School \n",
"1727 ZZZ3 HepG2 TF ChIP-seq HudsonAlpha Institute for Biotechnology \n",
"1728 ZZZ3 K562 TF ChIP-seq Stanford University \n",
"\n",
" string_biological_replicates organism term_id \\\n",
"0 [1, 2] human EFO:0001187 \n",
"1 [1, 2] human EFO:0002067 \n",
"2 [1, 2] human EFO:0001182 \n",
"3 [1, 2] human EFO:0002067 \n",
"4 [1, 2] human EFO:0001187 \n",
"... ... ... ... \n",
"1724 [1, 2] human EFO:0001182 \n",
"1725 [1, 2] human EFO:0001187 \n",
"1726 [1, 2] human EFO:0002784 \n",
"1727 [1, 2] human EFO:0001187 \n",
"1728 [1, 2] human EFO:0002067 \n",
"\n",
" title accession status ... output_category \\\n",
"0 Richard Myers, HAIB ENCFF290ZNR released ... signal \n",
"1 Kevin White, UChicago ENCFF340UTP released ... signal \n",
"2 Michael Snyder, Stanford ENCFF627OLR released ... signal \n",
"3 Michael Snyder, Stanford ENCFF003ZRP released ... signal \n",
"4 Richard Myers, HAIB ENCFF274ASN released ... signal \n",
"... ... ... ... ... ... \n",
"1724 Michael Snyder, Stanford ENCFF741ADU released ... signal \n",
"1725 Richard Myers, HAIB ENCFF805VXG released ... signal \n",
"1726 Kevin Struhl, HMS ENCFF636KMF released ... signal \n",
"1727 Richard Myers, HAIB ENCFF599UAF released ... signal \n",
"1728 Michael Snyder, Stanford ENCFF481FFX released ... signal \n",
"\n",
" output_type read_length read_length_units run_type \\\n",
"0 fold change over control NaN None None \n",
"1 fold change over control NaN None None \n",
"2 fold change over control NaN None None \n",
"3 fold change over control NaN None None \n",
"4 fold change over control NaN None None \n",
"... ... ... ... ... \n",
"1724 fold change over control NaN None None \n",
"1725 fold change over control NaN None None \n",
"1726 fold change over control NaN None None \n",
"1727 fold change over control NaN None None \n",
"1728 fold change over control NaN None None \n",
"\n",
" schema_version encode_version biological_replicates \\\n",
"0 26 4.0 [1, 2] \n",
"1 26 3.0 [1, 2] \n",
"2 26 4.0 [1, 2] \n",
"3 26 4.0 [1, 2] \n",
"4 26 4.0 [1, 2] \n",
"... ... ... ... \n",
"1724 26 4.0 [1, 2] \n",
"1725 26 4.0 [1, 2] \n",
"1726 26 3.0 [1, 2] \n",
"1727 26 4.0 [1, 2] \n",
"1728 26 4.0 [1, 2] \n",
"\n",
" technical_replicates url \n",
"0 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/07... \n",
"1 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2016/10... \n",
"2 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/08... \n",
"3 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/11... \n",
"4 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/07... \n",
"... ... ... \n",
"1724 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/08... \n",
"1725 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/06... \n",
"1726 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2017/07... \n",
"1727 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/07... \n",
"1728 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2020/12... \n",
"\n",
"[1729 rows x 27 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_combined"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}