Retrieving HG19 files metadata.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Retrieving HG19 epigenomic files\n",
"This notebook shows how the metadata of the HG19 epigenomic files is retrieved from ENCODE."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from glob import glob\n",
"import pandas as pd\n",
"import compress_json\n",
"from encodeproject import biosamples, accessions, biosample, download_urls"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We specify that we are only interested in files that use the [hg19 (GRCh37)](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/) assembly, are currently in status [released](https://www.encodeproject.org/help/getting-started/status-terms/#FileStatuses), have replication type [isogenic](https://www.encodeproject.org/data-standards/terms/) (there is a biological replication) and are in the [bigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html#:~:text=The%20bigWig%20format%20is%20useful,in%20an%20indexed%20binary%20format.&text=Wiggle%20data%20must%20be%20continuous%20and%20consist%20of%20equally%20sized%20elements.) file format."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"parameters = dict(\n",
" assembly=\"hg19\",\n",
" replication_type=\"isogenic\",\n",
" file_format=\"bigWig\",\n",
" status=\"released\",\n",
" use_multiprocessing=True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will append each dataset to the following list as soon as we obtain it."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"all_datasets = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieving ChIP-seq"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Retrieving biosamples', layout=Layout(flex='2'), max=1579…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>H3K4me2</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>Broad Institute</td>\n",
" <td>Bradley Bernstein, Broad</td>\n",
" <td>ENCFF998NCA</td>\n",
" <td>released</td>\n",
" <td>Histone ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 1_2, 2_1, 2_2, 2_3]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>SIX5</td>\n",
" <td>EFO:0003042</td>\n",
" <td>H1</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF095XBB</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>HCFC1</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF633TLX</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>MYC</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF790FHL</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>ZNF274</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>University of Southern California</td>\n",
" <td>Peggy Farnham, USC</td>\n",
" <td>ENCFF296ZAW</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/02...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1106</th>\n",
" <td>human</td>\n",
" <td>POLR2AphosphoS5</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF002UPS</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/06...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1107</th>\n",
" <td>human</td>\n",
" <td>EP300</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF820BXH</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/07...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1108</th>\n",
" <td>human</td>\n",
" <td>CTCF</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF933ZLL</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/04...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1109</th>\n",
" <td>human</td>\n",
" <td>POLR2A</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF368HBX</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/01...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1110</th>\n",
" <td>human</td>\n",
" <td>POLR2A</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF647MSS</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/09...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1111 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line \\\n",
"0 human H3K4me2 EFO:0001203 MCF-7 \n",
"1 human SIX5 EFO:0003042 H1 \n",
"2 human HCFC1 EFO:0002067 K562 \n",
"3 human MYC EFO:0002067 K562 \n",
"4 human ZNF274 EFO:0002067 K562 \n",
"... ... ... ... ... \n",
"1106 human POLR2AphosphoS5 EFO:0002784 GM12878 \n",
"1107 human EP300 EFO:0002784 GM12878 \n",
"1108 human CTCF EFO:0002067 K562 \n",
"1109 human POLR2A EFO:0002784 GM12878 \n",
"1110 human POLR2A EFO:0002067 K562 \n",
"\n",
" institute_name title \\\n",
"0 Broad Institute Bradley Bernstein, Broad \n",
"1 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB \n",
"2 Stanford University Michael Snyder, Stanford \n",
"3 Stanford University Michael Snyder, Stanford \n",
"4 University of Southern California Peggy Farnham, USC \n",
"... ... ... \n",
"1106 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB \n",
"1107 Stanford University Michael Snyder, Stanford \n",
"1108 Stanford University Michael Snyder, Stanford \n",
"1109 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB \n",
"1110 Stanford University Michael Snyder, Stanford \n",
"\n",
" accession status assay_title assay_term_name ... \\\n",
"0 ENCFF998NCA released Histone ChIP-seq ChIP-seq ... \n",
"1 ENCFF095XBB released TF ChIP-seq ChIP-seq ... \n",
"2 ENCFF633TLX released TF ChIP-seq ChIP-seq ... \n",
"3 ENCFF790FHL released TF ChIP-seq ChIP-seq ... \n",
"4 ENCFF296ZAW released TF ChIP-seq ChIP-seq ... \n",
"... ... ... ... ... ... \n",
"1106 ENCFF002UPS released TF ChIP-seq ChIP-seq ... \n",
"1107 ENCFF820BXH released TF ChIP-seq ChIP-seq ... \n",
"1108 ENCFF933ZLL released TF ChIP-seq ChIP-seq ... \n",
"1109 ENCFF368HBX released TF ChIP-seq ChIP-seq ... \n",
"1110 ENCFF647MSS released TF ChIP-seq ChIP-seq ... \n",
"\n",
" output_category output_type read_length read_length_units \\\n",
"0 signal fold change over control NaN None \n",
"1 signal fold change over control NaN None \n",
"2 signal fold change over control NaN None \n",
"3 signal fold change over control NaN None \n",
"4 signal fold change over control NaN None \n",
"... ... ... ... ... \n",
"1106 signal fold change over control NaN None \n",
"1107 signal fold change over control NaN None \n",
"1108 signal fold change over control NaN None \n",
"1109 signal fold change over control NaN None \n",
"1110 signal fold change over control NaN None \n",
"\n",
" run_type schema_version encode_version biological_replicates \\\n",
"0 None 26 3.0 [1, 2] \n",
"1 None 26 3.0 [1, 2] \n",
"2 None 26 3.0 [1, 2] \n",
"3 None 26 3.0 [1, 2] \n",
"4 None 26 3.0 [1, 2] \n",
"... ... ... ... ... \n",
"1106 None 26 3.0 [1, 2] \n",
"1107 None 26 3.0 [1, 2] \n",
"1108 None 26 3.0 [1, 2] \n",
"1109 None 26 3.0 [1, 2] \n",
"1110 None 26 3.0 [1, 2] \n",
"\n",
" technical_replicates \\\n",
"0 [1_1, 1_2, 2_1, 2_2, 2_3] \n",
"1 [1_1, 2_1] \n",
"2 [1_1, 2_1] \n",
"3 [1_1, 2_1] \n",
"4 [1_1, 2_1] \n",
"... ... \n",
"1106 [1_1, 2_1] \n",
"1107 [1_1, 2_1] \n",
"1108 [1_1, 2_1] \n",
"1109 [1_1, 2_1] \n",
"1110 [1_1, 2_1] \n",
"\n",
" url \n",
"0 https://encode-public.s3.amazonaws.com/2016/11... \n",
"1 https://encode-public.s3.amazonaws.com/2017/12... \n",
"2 https://encode-public.s3.amazonaws.com/2016/12... \n",
"3 https://encode-public.s3.amazonaws.com/2017/12... \n",
"4 https://encode-public.s3.amazonaws.com/2017/02... \n",
"... ... \n",
"1106 https://encode-public.s3.amazonaws.com/2017/06... \n",
"1107 https://encode-public.s3.amazonaws.com/2017/07... \n",
"1108 https://encode-public.s3.amazonaws.com/2017/04... \n",
"1109 https://encode-public.s3.amazonaws.com/2017/01... \n",
"1110 https://encode-public.s3.amazonaws.com/2016/09... \n",
"\n",
"[1111 rows x 26 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples = biosamples(\n",
" accessions=accessions(compress_json.load(\"hg38_encode_queries/chipseq.json\")),\n",
" min_biological_replicates=2,\n",
" output_type=\"fold change over control\",\n",
" **parameters\n",
")\n",
"all_datasets.append(samples)\n",
"samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieving DNase-seq"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Retrieving biosamples', layout=Layout(flex='2'), max=5.0,…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF615FRD</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF922TLC</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF180FXV</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF723TWJ</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_7]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF901GZH</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF264NMW</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line institute_name \\\n",
"0 human Unknown EFO:0001203 MCF-7 University of Washington \n",
"1 human Unknown EFO:0001203 MCF-7 University of Washington \n",
"2 human Unknown EFO:0001086 A549 University of Washington \n",
"3 human Unknown EFO:0001086 A549 University of Washington \n",
"4 human Unknown EFO:0002784 GM12878 University of Washington \n",
"5 human Unknown EFO:0002784 GM12878 University of Washington \n",
"\n",
" title accession status assay_title \\\n",
"0 John Stamatoyannopoulos, UW ENCFF615FRD released DNase-seq \n",
"1 John Stamatoyannopoulos, UW ENCFF922TLC released DNase-seq \n",
"2 John Stamatoyannopoulos, UW ENCFF180FXV released DNase-seq \n",
"3 John Stamatoyannopoulos, UW ENCFF723TWJ released DNase-seq \n",
"4 John Stamatoyannopoulos, UW ENCFF901GZH released DNase-seq \n",
"5 John Stamatoyannopoulos, UW ENCFF264NMW released DNase-seq \n",
"\n",
" assay_term_name ... output_category output_type \\\n",
"0 DNase-seq ... signal read-depth normalized signal \n",
"1 DNase-seq ... signal read-depth normalized signal \n",
"2 DNase-seq ... signal read-depth normalized signal \n",
"3 DNase-seq ... signal read-depth normalized signal \n",
"4 DNase-seq ... signal read-depth normalized signal \n",
"5 DNase-seq ... signal read-depth normalized signal \n",
"\n",
" read_length read_length_units run_type schema_version encode_version \\\n",
"0 NaN None None 26 3.0 \n",
"1 NaN None None 26 3.0 \n",
"2 NaN None None 26 3.0 \n",
"3 NaN None None 26 3.0 \n",
"4 NaN None None 26 3.0 \n",
"5 NaN None None 26 3.0 \n",
"\n",
" biological_replicates technical_replicates \\\n",
"0 [2] [2_1] \n",
"1 [1] [1_1] \n",
"2 [2] [2_1] \n",
"3 [1] [1_7] \n",
"4 [2] [2_1] \n",
"5 [1] [1_1] \n",
"\n",
" url \n",
"0 https://encode-public.s3.amazonaws.com/2017/09... \n",
"1 https://encode-public.s3.amazonaws.com/2017/09... \n",
"2 https://encode-public.s3.amazonaws.com/2017/12... \n",
"3 https://encode-public.s3.amazonaws.com/2017/12... \n",
"4 https://encode-public.s3.amazonaws.com/2017/09... \n",
"5 https://encode-public.s3.amazonaws.com/2017/09... \n",
"\n",
"[6 rows x 26 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples = biosamples(\n",
" accessions=accessions(compress_json.load(\"hg38_encode_queries/dnaseseq.json\")),\n",
" organism=None,\n",
" **parameters\n",
")\n",
"samples[\"organism\"] = \"human\"\n",
"\n",
"all_datasets.append(samples)\n",
"samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieving FAIRE-seq"
]
},
{
"cell_type": "raw",
"metadata": {
"scrolled": true
},
"source": [
"samples = biosamples(\n",
" accessions=accessions(compress_json.load(\"hg38_encode_queries/faireseq.json\")),\n",
" organism=None,\n",
" **parameters,\n",
")\n",
"# I have manually checked that the version of the files is 3, but it is not available in the metadata.\n",
"samples[\"encode_version\"] = 3\n",
"samples[\"organism\"] = \"human\"\n",
"all_datasets.append(samples)\n",
"samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieving DNAME"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Retrieving biosamples', layout=Layout(flex='2'), max=41.0…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [organism, target, term_id, cell_line, institute_name, title, accession, status, assay_title, assay_term_name, replication_type, date_released, assembly, file_format, file_size, file_type, output_category, output_type, read_length, read_length_units, run_type, schema_version, encode_version, biological_replicates, technical_replicates, url]\n",
"Index: []\n",
"\n",
"[0 rows x 26 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples = biosamples(\n",
" accessions=accessions(compress_json.load(\"hg38_encode_queries/dname.json\")),\n",
" organism=None,\n",
" min_biological_replicates=0,\n",
" output_type=None,\n",
" **parameters\n",
")\n",
"samples[\"organism\"] = \"human\"\n",
"\n",
"all_datasets.append(samples)\n",
"samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Combining all datasets"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>H3K4me2</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>Broad Institute</td>\n",
" <td>Bradley Bernstein, Broad</td>\n",
" <td>ENCFF998NCA</td>\n",
" <td>released</td>\n",
" <td>Histone ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 1_2, 2_1, 2_2, 2_3]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>SIX5</td>\n",
" <td>EFO:0003042</td>\n",
" <td>H1</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF095XBB</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>HCFC1</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF633TLX</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>MYC</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF790FHL</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>ZNF274</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>University of Southern California</td>\n",
" <td>Peggy Farnham, USC</td>\n",
" <td>ENCFF296ZAW</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/02...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF922TLC</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF180FXV</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF723TWJ</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_7]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF901GZH</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF264NMW</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1117 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line \\\n",
"0 human H3K4me2 EFO:0001203 MCF-7 \n",
"1 human SIX5 EFO:0003042 H1 \n",
"2 human HCFC1 EFO:0002067 K562 \n",
"3 human MYC EFO:0002067 K562 \n",
"4 human ZNF274 EFO:0002067 K562 \n",
".. ... ... ... ... \n",
"1 human Unknown EFO:0001203 MCF-7 \n",
"2 human Unknown EFO:0001086 A549 \n",
"3 human Unknown EFO:0001086 A549 \n",
"4 human Unknown EFO:0002784 GM12878 \n",
"5 human Unknown EFO:0002784 GM12878 \n",
"\n",
" institute_name title \\\n",
"0 Broad Institute Bradley Bernstein, Broad \n",
"1 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB \n",
"2 Stanford University Michael Snyder, Stanford \n",
"3 Stanford University Michael Snyder, Stanford \n",
"4 University of Southern California Peggy Farnham, USC \n",
".. ... ... \n",
"1 University of Washington John Stamatoyannopoulos, UW \n",
"2 University of Washington John Stamatoyannopoulos, UW \n",
"3 University of Washington John Stamatoyannopoulos, UW \n",
"4 University of Washington John Stamatoyannopoulos, UW \n",
"5 University of Washington John Stamatoyannopoulos, UW \n",
"\n",
" accession status assay_title assay_term_name ... \\\n",
"0 ENCFF998NCA released Histone ChIP-seq ChIP-seq ... \n",
"1 ENCFF095XBB released TF ChIP-seq ChIP-seq ... \n",
"2 ENCFF633TLX released TF ChIP-seq ChIP-seq ... \n",
"3 ENCFF790FHL released TF ChIP-seq ChIP-seq ... \n",
"4 ENCFF296ZAW released TF ChIP-seq ChIP-seq ... \n",
".. ... ... ... ... ... \n",
"1 ENCFF922TLC released DNase-seq DNase-seq ... \n",
"2 ENCFF180FXV released DNase-seq DNase-seq ... \n",
"3 ENCFF723TWJ released DNase-seq DNase-seq ... \n",
"4 ENCFF901GZH released DNase-seq DNase-seq ... \n",
"5 ENCFF264NMW released DNase-seq DNase-seq ... \n",
"\n",
" output_category output_type read_length \\\n",
"0 signal fold change over control NaN \n",
"1 signal fold change over control NaN \n",
"2 signal fold change over control NaN \n",
"3 signal fold change over control NaN \n",
"4 signal fold change over control NaN \n",
".. ... ... ... \n",
"1 signal read-depth normalized signal NaN \n",
"2 signal read-depth normalized signal NaN \n",
"3 signal read-depth normalized signal NaN \n",
"4 signal read-depth normalized signal NaN \n",
"5 signal read-depth normalized signal NaN \n",
"\n",
" read_length_units run_type schema_version encode_version \\\n",
"0 None None 26 3.0 \n",
"1 None None 26 3.0 \n",
"2 None None 26 3.0 \n",
"3 None None 26 3.0 \n",
"4 None None 26 3.0 \n",
".. ... ... ... ... \n",
"1 None None 26 3.0 \n",
"2 None None 26 3.0 \n",
"3 None None 26 3.0 \n",
"4 None None 26 3.0 \n",
"5 None None 26 3.0 \n",
"\n",
" biological_replicates technical_replicates \\\n",
"0 [1, 2] [1_1, 1_2, 2_1, 2_2, 2_3] \n",
"1 [1, 2] [1_1, 2_1] \n",
"2 [1, 2] [1_1, 2_1] \n",
"3 [1, 2] [1_1, 2_1] \n",
"4 [1, 2] [1_1, 2_1] \n",
".. ... ... \n",
"1 [1] [1_1] \n",
"2 [2] [2_1] \n",
"3 [1] [1_7] \n",
"4 [2] [2_1] \n",
"5 [1] [1_1] \n",
"\n",
" url \n",
"0 https://encode-public.s3.amazonaws.com/2016/11... \n",
"1 https://encode-public.s3.amazonaws.com/2017/12... \n",
"2 https://encode-public.s3.amazonaws.com/2016/12... \n",
"3 https://encode-public.s3.amazonaws.com/2017/12... \n",
"4 https://encode-public.s3.amazonaws.com/2017/02... \n",
".. ... \n",
"1 https://encode-public.s3.amazonaws.com/2017/09... \n",
"2 https://encode-public.s3.amazonaws.com/2017/12... \n",
"3 https://encode-public.s3.amazonaws.com/2017/12... \n",
"4 https://encode-public.s3.amazonaws.com/2017/09... \n",
"5 https://encode-public.s3.amazonaws.com/2017/09... \n",
"\n",
"[1117 rows x 26 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined = pd.concat(all_datasets)\n",
"combined"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organism</th>\n",
" <th>target</th>\n",
" <th>term_id</th>\n",
" <th>cell_line</th>\n",
" <th>institute_name</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>assay_title</th>\n",
" <th>assay_term_name</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>human</td>\n",
" <td>H3K4me2</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>Broad Institute</td>\n",
" <td>Bradley Bernstein, Broad</td>\n",
" <td>ENCFF998NCA</td>\n",
" <td>released</td>\n",
" <td>Histone ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 1_2, 2_1, 2_2, 2_3]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>SIX5</td>\n",
" <td>EFO:0003042</td>\n",
" <td>H1</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF095XBB</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>HCFC1</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF633TLX</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>MYC</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>Stanford University</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF790FHL</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>ZNF274</td>\n",
" <td>EFO:0002067</td>\n",
" <td>K562</td>\n",
" <td>University of Southern California</td>\n",
" <td>Peggy Farnham, USC</td>\n",
" <td>ENCFF296ZAW</td>\n",
" <td>released</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>ChIP-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/02...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001203</td>\n",
" <td>MCF-7</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF922TLC</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF180FXV</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0001086</td>\n",
" <td>A549</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF723TWJ</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_7]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/12...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF901GZH</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[2]</td>\n",
" <td>[2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>human</td>\n",
" <td>Unknown</td>\n",
" <td>EFO:0002784</td>\n",
" <td>GM12878</td>\n",
" <td>University of Washington</td>\n",
" <td>John Stamatoyannopoulos, UW</td>\n",
" <td>ENCFF264NMW</td>\n",
" <td>released</td>\n",
" <td>DNase-seq</td>\n",
" <td>DNase-seq</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>read-depth normalized signal</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1]</td>\n",
" <td>[1_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/09...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1117 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" organism target term_id cell_line \\\n",
"0 human H3K4me2 EFO:0001203 MCF-7 \n",
"1 human SIX5 EFO:0003042 H1 \n",
"2 human HCFC1 EFO:0002067 K562 \n",
"3 human MYC EFO:0002067 K562 \n",
"4 human ZNF274 EFO:0002067 K562 \n",
".. ... ... ... ... \n",
"1 human Unknown EFO:0001203 MCF-7 \n",
"2 human Unknown EFO:0001086 A549 \n",
"3 human Unknown EFO:0001086 A549 \n",
"4 human Unknown EFO:0002784 GM12878 \n",
"5 human Unknown EFO:0002784 GM12878 \n",
"\n",
" institute_name title \\\n",
"0 Broad Institute Bradley Bernstein, Broad \n",
"1 HudsonAlpha Institute for Biotechnology Richard Myers, HAIB \n",
"2 Stanford University Michael Snyder, Stanford \n",
"3 Stanford University Michael Snyder, Stanford \n",
"4 University of Southern California Peggy Farnham, USC \n",
".. ... ... \n",
"1 University of Washington John Stamatoyannopoulos, UW \n",
"2 University of Washington John Stamatoyannopoulos, UW \n",
"3 University of Washington John Stamatoyannopoulos, UW \n",
"4 University of Washington John Stamatoyannopoulos, UW \n",
"5 University of Washington John Stamatoyannopoulos, UW \n",
"\n",
" accession status assay_title assay_term_name ... \\\n",
"0 ENCFF998NCA released Histone ChIP-seq ChIP-seq ... \n",
"1 ENCFF095XBB released TF ChIP-seq ChIP-seq ... \n",
"2 ENCFF633TLX released TF ChIP-seq ChIP-seq ... \n",
"3 ENCFF790FHL released TF ChIP-seq ChIP-seq ... \n",
"4 ENCFF296ZAW released TF ChIP-seq ChIP-seq ... \n",
".. ... ... ... ... ... \n",
"1 ENCFF922TLC released DNase-seq DNase-seq ... \n",
"2 ENCFF180FXV released DNase-seq DNase-seq ... \n",
"3 ENCFF723TWJ released DNase-seq DNase-seq ... \n",
"4 ENCFF901GZH released DNase-seq DNase-seq ... \n",
"5 ENCFF264NMW released DNase-seq DNase-seq ... \n",
"\n",
" output_category output_type read_length \\\n",
"0 signal fold change over control NaN \n",
"1 signal fold change over control NaN \n",
"2 signal fold change over control NaN \n",
"3 signal fold change over control NaN \n",
"4 signal fold change over control NaN \n",
".. ... ... ... \n",
"1 signal read-depth normalized signal NaN \n",
"2 signal read-depth normalized signal NaN \n",
"3 signal read-depth normalized signal NaN \n",
"4 signal read-depth normalized signal NaN \n",
"5 signal read-depth normalized signal NaN \n",
"\n",
" read_length_units run_type schema_version encode_version \\\n",
"0 None None 26 3.0 \n",
"1 None None 26 3.0 \n",
"2 None None 26 3.0 \n",
"3 None None 26 3.0 \n",
"4 None None 26 3.0 \n",
".. ... ... ... ... \n",
"1 None None 26 3.0 \n",
"2 None None 26 3.0 \n",
"3 None None 26 3.0 \n",
"4 None None 26 3.0 \n",
"5 None None 26 3.0 \n",
"\n",
" biological_replicates technical_replicates \\\n",
"0 [1, 2] [1_1, 1_2, 2_1, 2_2, 2_3] \n",
"1 [1, 2] [1_1, 2_1] \n",
"2 [1, 2] [1_1, 2_1] \n",
"3 [1, 2] [1_1, 2_1] \n",
"4 [1, 2] [1_1, 2_1] \n",
".. ... ... \n",
"1 [1] [1_1] \n",
"2 [2] [2_1] \n",
"3 [1] [1_7] \n",
"4 [2] [2_1] \n",
"5 [1] [1_1] \n",
"\n",
" url \n",
"0 https://encode-public.s3.amazonaws.com/2016/11... \n",
"1 https://encode-public.s3.amazonaws.com/2017/12... \n",
"2 https://encode-public.s3.amazonaws.com/2016/12... \n",
"3 https://encode-public.s3.amazonaws.com/2017/12... \n",
"4 https://encode-public.s3.amazonaws.com/2017/02... \n",
".. ... \n",
"1 https://encode-public.s3.amazonaws.com/2017/09... \n",
"2 https://encode-public.s3.amazonaws.com/2017/12... \n",
"3 https://encode-public.s3.amazonaws.com/2017/12... \n",
"4 https://encode-public.s3.amazonaws.com/2017/09... \n",
"5 https://encode-public.s3.amazonaws.com/2017/09... \n",
"\n",
"[1117 rows x 26 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Keeping only latest encode version of each file"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"combined[\"string_biological_replicates\"] = combined[\"biological_replicates\"].astype(str)\n",
"filtered_combined = combined.sort_values(\"encode_version\").groupby([\n",
" \"target\",\n",
" \"cell_line\",\n",
" \"assay_title\",\n",
" \"institute_name\",\n",
" \"string_biological_replicates\"\n",
"]).last().reset_index()\n",
"\n",
"filtered_combined.to_csv(\"epigenomic_dataset/epigenomes_metadata/hg19.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>target</th>\n",
" <th>cell_line</th>\n",
" <th>assay_title</th>\n",
" <th>institute_name</th>\n",
" <th>string_biological_replicates</th>\n",
" <th>organism</th>\n",
" <th>term_id</th>\n",
" <th>title</th>\n",
" <th>accession</th>\n",
" <th>status</th>\n",
" <th>...</th>\n",
" <th>output_category</th>\n",
" <th>output_type</th>\n",
" <th>read_length</th>\n",
" <th>read_length_units</th>\n",
" <th>run_type</th>\n",
" <th>schema_version</th>\n",
" <th>encode_version</th>\n",
" <th>biological_replicates</th>\n",
" <th>technical_replicates</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ADNP</td>\n",
" <td>K562</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>University of Chicago</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0002067</td>\n",
" <td>Kevin White, UChicago</td>\n",
" <td>ENCFF946EOR</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/04...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AEBP2</td>\n",
" <td>HEK293</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001182</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF649VIZ</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AFF1</td>\n",
" <td>K562</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0002067</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF870PDS</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/05...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AGO1</td>\n",
" <td>K562</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>University of California at San Diego</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0002067</td>\n",
" <td>Xiang-Dong Fu, UCSD</td>\n",
" <td>ENCFF054VBS</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/02...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AGO2</td>\n",
" <td>HepG2</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>University of California at San Diego</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001187</td>\n",
" <td>Xiang-Dong Fu, UCSD</td>\n",
" <td>ENCFF010SVM</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/02...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1036</th>\n",
" <td>ZSCAN5A</td>\n",
" <td>HEK293</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001182</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF446MUO</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1037</th>\n",
" <td>ZSCAN5C</td>\n",
" <td>HEK293</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001182</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF881ROP</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2017/03...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1038</th>\n",
" <td>ZSCAN9</td>\n",
" <td>HepG2</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>HudsonAlpha Institute for Biotechnology</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001187</td>\n",
" <td>Richard Myers, HAIB</td>\n",
" <td>ENCFF572RWA</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2018/11...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1039</th>\n",
" <td>ZXDB</td>\n",
" <td>HEK293</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0001182</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF342SQC</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/09...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1040</th>\n",
" <td>ZZZ3</td>\n",
" <td>K562</td>\n",
" <td>TF ChIP-seq</td>\n",
" <td>Stanford University</td>\n",
" <td>[1, 2]</td>\n",
" <td>human</td>\n",
" <td>EFO:0002067</td>\n",
" <td>Michael Snyder, Stanford</td>\n",
" <td>ENCFF716WJE</td>\n",
" <td>released</td>\n",
" <td>...</td>\n",
" <td>signal</td>\n",
" <td>fold change over control</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>26</td>\n",
" <td>3.0</td>\n",
" <td>[1, 2]</td>\n",
" <td>[1_1, 2_1]</td>\n",
" <td>https://encode-public.s3.amazonaws.com/2016/09...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1041 rows × 27 columns</p>\n",
"</div>"
],
"text/plain": [
" target cell_line assay_title institute_name \\\n",
"0 ADNP K562 TF ChIP-seq University of Chicago \n",
"1 AEBP2 HEK293 TF ChIP-seq Stanford University \n",
"2 AFF1 K562 TF ChIP-seq Stanford University \n",
"3 AGO1 K562 TF ChIP-seq University of California at San Diego \n",
"4 AGO2 HepG2 TF ChIP-seq University of California at San Diego \n",
"... ... ... ... ... \n",
"1036 ZSCAN5A HEK293 TF ChIP-seq Stanford University \n",
"1037 ZSCAN5C HEK293 TF ChIP-seq Stanford University \n",
"1038 ZSCAN9 HepG2 TF ChIP-seq HudsonAlpha Institute for Biotechnology \n",
"1039 ZXDB HEK293 TF ChIP-seq Stanford University \n",
"1040 ZZZ3 K562 TF ChIP-seq Stanford University \n",
"\n",
" string_biological_replicates organism term_id \\\n",
"0 [1, 2] human EFO:0002067 \n",
"1 [1, 2] human EFO:0001182 \n",
"2 [1, 2] human EFO:0002067 \n",
"3 [1, 2] human EFO:0002067 \n",
"4 [1, 2] human EFO:0001187 \n",
"... ... ... ... \n",
"1036 [1, 2] human EFO:0001182 \n",
"1037 [1, 2] human EFO:0001182 \n",
"1038 [1, 2] human EFO:0001187 \n",
"1039 [1, 2] human EFO:0001182 \n",
"1040 [1, 2] human EFO:0002067 \n",
"\n",
" title accession status ... output_category \\\n",
"0 Kevin White, UChicago ENCFF946EOR released ... signal \n",
"1 Michael Snyder, Stanford ENCFF649VIZ released ... signal \n",
"2 Michael Snyder, Stanford ENCFF870PDS released ... signal \n",
"3 Xiang-Dong Fu, UCSD ENCFF054VBS released ... signal \n",
"4 Xiang-Dong Fu, UCSD ENCFF010SVM released ... signal \n",
"... ... ... ... ... ... \n",
"1036 Michael Snyder, Stanford ENCFF446MUO released ... signal \n",
"1037 Michael Snyder, Stanford ENCFF881ROP released ... signal \n",
"1038 Richard Myers, HAIB ENCFF572RWA released ... signal \n",
"1039 Michael Snyder, Stanford ENCFF342SQC released ... signal \n",
"1040 Michael Snyder, Stanford ENCFF716WJE released ... signal \n",
"\n",
" output_type read_length read_length_units run_type \\\n",
"0 fold change over control NaN None None \n",
"1 fold change over control NaN None None \n",
"2 fold change over control NaN None None \n",
"3 fold change over control NaN None None \n",
"4 fold change over control NaN None None \n",
"... ... ... ... ... \n",
"1036 fold change over control NaN None None \n",
"1037 fold change over control NaN None None \n",
"1038 fold change over control NaN None None \n",
"1039 fold change over control NaN None None \n",
"1040 fold change over control NaN None None \n",
"\n",
" schema_version encode_version biological_replicates \\\n",
"0 26 3.0 [1, 2] \n",
"1 26 3.0 [1, 2] \n",
"2 26 3.0 [1, 2] \n",
"3 26 3.0 [1, 2] \n",
"4 26 3.0 [1, 2] \n",
"... ... ... ... \n",
"1036 26 3.0 [1, 2] \n",
"1037 26 3.0 [1, 2] \n",
"1038 26 3.0 [1, 2] \n",
"1039 26 3.0 [1, 2] \n",
"1040 26 3.0 [1, 2] \n",
"\n",
" technical_replicates url \n",
"0 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2016/04... \n",
"1 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2016/09... \n",
"2 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2017/05... \n",
"3 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2017/02... \n",
"4 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2017/02... \n",
"... ... ... \n",
"1036 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2016/09... \n",
"1037 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2017/03... \n",
"1038 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2018/11... \n",
"1039 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2016/09... \n",
"1040 [1_1, 2_1] https://encode-public.s3.amazonaws.com/2016/09... \n",
"\n",
"[1041 rows x 27 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_combined"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}