# docs/tutobooks.py
"""Keras tutobooks implementation.
A tutobook is a tutorial available simultaneously as a notebook,
as a Python script, and as a nicely rendered webpage.
Its source-of-truth (for manual editing and version control) is
its Python script form, but you can also create one by starting
from a notebook and converting it with the command `nb2py`.
Text cells are stored in markdown-formatted comment blocks.
The first line (starting with " * 3) may optionally contain a special
annotation, one of:
- invisible: do not render this block.
- shell: execute this block while prefixing each line with `!`.
The script form should start with a header with the following fields:
Title:
Author: (could be `Authors`: as well, and may contain markdown links)
Date created: (date in yyyy/mm/dd format)
Last modified: (date in yyyy/mm/dd format)
Description: (one-line text description)
## How to add a new code example to Keras.io
You would typically start from an existing notebook.
Save it to disk (let's say as `path_to_your_nb.ipynb`).
`cd` to the `keras-io/scripts/` directory.
Then run:
```
python tutobooks.py nb2py path_to_your_nb.ipynb ../examples/your_example.py
```
This will create the file `examples/your_example.py`. Open it,
fill in the headers, and generally edit it so that it looks nice.
NOTE THAT THE CONVERSION SCRIPT MAY MAKE MISTAKES IN ITS ATTEMPTS
TO SHORTEN LINES. MAKE SURE TO PROOFREAD THE GENERATED .py IN FULL.
Or alternatively, make sure to keep your lines reasonably-sized (<90 char)
to start with, so that the script won't have to shorten them.
You can then preview what it looks like when converted back again
to ipynb by running:
```
python tutobooks.py py2nb ../examples/your_example.py preview.ipynb
```
NOTE THAT THIS COMMAND WILL ERROR OUT IF ANY CELL TAKES TOO LONG
TO EXECUTE. In that case, make your code lighter/faster.
Remember that examples are meant to demonstrate workflows, not
train state-of-the-art models. They should
stay very lightweight.
Open the generated `preview.ipynb` and make sure it looks like what
you expect. If not, keep editing `your_example.py` until it does.
Finally, submit a PR adding `examples/your_example.py`.
"""
import copy
import json
import os
import random
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
# Value passed to nbconvert as `--ExecutePreprocessor.timeout` when `py_to_nb`
# executes the generated notebook (seconds, i.e. one hour).
TIMEOUT = 60 * 60
# Upper bound on the lines of code (as counted by `_count_locs`) that a
# tutobook may contain; `py_to_nb` raises a ValueError above this.
MAX_LOC = 300
def nb_to_py(nb_path, py_path):
    """Convert a Jupyter notebook into a tutobook Python script.

    The generated script starts with a placeholder header (every field set
    to `FILLME`). Markdown cells become triple-quote-fenced blocks, code
    cells whose first character is `!` become `shell`-tagged blocks, and
    all other code cells are copied verbatim. Cells of any other type are
    ignored. The script is then formatted with `black` and its long lines
    are shortened with `_shorten_lines`.

    Args:
        nb_path: Path of the source `.ipynb` file.
        py_path: Path of the `.py` file to create (overwritten if present).

    Raises:
        RuntimeError: Propagated from `_shorten_lines` when a line cannot
            be shortened; the formatted script is still left on disk.
    """
    # Notebook files are JSON documents, which are UTF-8 encoded.
    with open(nb_path, encoding="utf-8") as f:
        nb = json.load(f)

    chunks = [
        '"""\n',
        "Title: FILLME\n",
        "Author: FILLME\n",
        "Date created: FILLME\n",
        "Last modified: FILLME\n",
        "Description: FILLME\n",
        '"""\n',
    ]
    for cell in nb["cells"]:
        if cell["cell_type"] == "code":
            source = cell["source"]
            text = "".join(source)
            # `source` may be a list of lines or a single string; either
            # way `source[0][:1]` is the first character of the cell.
            if source and source[0][:1] == "!":
                # `py_to_nb` re-adds a `!` in front of every line of a
                # shell block, so drop it here to make the round trip
                # lossless (otherwise commands come back as `!!cmd`).
                text = "\n".join(
                    line[1:] if line.startswith("!") else line
                    for line in text.split("\n")
                )
                chunks.append('"""shell\n' + text + '\n"""\n\n')
            else:
                chunks.append(text + "\n\n")
        elif cell["cell_type"] == "markdown":
            chunks.append('"""\n' + "".join(cell["source"]) + '\n"""\n\n')

    with open(py_path, "w", encoding="utf-8") as f:
        f.write("".join(chunks))

    # Format file with Black. An argument list (no shell) keeps paths that
    # contain spaces or shell metacharacters intact.
    try:
        subprocess.run(["black", str(py_path)], check=False)
    except FileNotFoundError:
        # Formatting stays best-effort, as it was with `os.system`.
        print("`black` executable not found; skipping formatting.")

    # Shorten lines
    with open(py_path, encoding="utf-8") as f:
        py = f.read()
    try:
        py = _shorten_lines(py)
    finally:
        # Write back whatever we have, even if shortening failed midway.
        with open(py_path, "w", encoding="utf-8") as f:
            f.write(py)
def py_to_nb(py_path, nb_path, fill_outputs=True):
    """Convert a tutobook Python script into a Jupyter notebook.

    The script is split into elements with `_get_next_script_element`.
    Triple-quote-fenced blocks become markdown cells, `shell`-tagged blocks
    become code cells with every line prefixed by `!`, `invisible`-tagged
    blocks are dropped, and everything else becomes a code cell.

    Args:
        py_path: Path of the source `.py` file.
        nb_path: Path of the `.ipynb` file to write.
        fill_outputs: If True, execute the notebook in place with
            `jupyter nbconvert` and afterwards remove any file or folder
            that appeared next to the notebook during execution.

    Raises:
        ValueError: If the script contains more than `MAX_LOC` lines of
            code (nothing is written in that case), or if
            `_get_next_script_element` rejects a cell tag.
    """
    with open(py_path, encoding="utf-8") as f:
        py = f.read()

    # NOTE: header validation (`validate`) and the dedicated header cell
    # (`_parse_header`) are currently disabled; the header block is simply
    # emitted as the first markdown cell.
    cells = []
    loc = 0
    while py:
        element, cell_type, py, tag = _get_next_script_element(py)
        lines = element.split("\n")
        if all(line == "" for line in lines):
            continue
        if not lines[0]:
            lines = lines[1:]
        source = [line + "\n" for line in lines]
        # Drop a trailing blank line
        if source and not source[-1].strip():
            source = source[:-1]
        if tag == "shell":
            source = ["!" + line for line in source]
            cell_type = "code"
        if tag == "invisible" or not source:
            continue
        cell = {"cell_type": cell_type, "source": source}
        if cell_type == "code":
            cell["outputs"] = []
            cell["metadata"] = {"colab_type": "code"}
            cell["execution_count"] = 0
            loc += _count_locs(source)
        else:
            cell["metadata"] = {"colab_type": "text"}
        cells.append(cell)

    # Deep-copy the template: a shallow copy would write this notebook's
    # name into the nested dicts of the shared module-level NB_BASE.
    notebook = copy.deepcopy(NB_BASE)
    notebook["metadata"]["colab"]["name"] = Path(py_path).stem
    notebook["cells"] = cells
    if loc > MAX_LOC:
        raise ValueError(
            "Found %d lines of code, but expected fewer than %d"
            % (loc, MAX_LOC)
        )

    with open(nb_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(notebook, indent=1, sort_keys=True))

    if fill_outputs:
        print("Generating ipynb")
        parent_dir = Path(nb_path).parent
        # Snapshot taken after the notebook is written, so the notebook
        # itself is never considered a "created" file below.
        current_files = set(os.listdir(parent_dir))
        try:
            # Argument list (no shell) so that paths containing spaces or
            # shell metacharacters are passed through unchanged.
            subprocess.run(
                [
                    "jupyter",
                    "nbconvert",
                    "--to",
                    "notebook",
                    "--execute",
                    "--debug",
                    str(nb_path),
                    "--inplace",
                    "--ExecutePreprocessor.timeout=" + str(TIMEOUT),
                ],
                check=False,
            )
        except FileNotFoundError:
            # Execution stays best-effort, as it was with `os.system`.
            print("`jupyter` executable not found; outputs were not filled.")
        finally:
            # Remove whatever the executed notebook created on disk.
            for fname in os.listdir(parent_dir):
                if fname in current_files:
                    continue
                fpath = parent_dir / fname
                if os.path.isdir(fpath):
                    print("Removing created folder:", fname)
                    shutil.rmtree(fpath)
                else:
                    print("Removing created file:", fname)
                    os.remove(fpath)
def nb_to_md(nb_path, md_path, img_dir, working_dir=None):
    """Render a notebook as a markdown file plus extracted images.

    The notebook is copied into a working directory and converted with
    `jupyter nbconvert --to markdown` (the notebook is NOT executed here;
    it is assumed to be already populated). Images produced by the
    conversion are copied to `<img_dir>/<md name>/`, image links in the
    markdown are rewritten to point there, and indented output blocks are
    wrapped by `_make_output_code_blocks`.

    Args:
        nb_path: Path of the source `.ipynb` file.
        md_path: Path of the `.md` file to write; must end with `.md`.
        img_dir: Directory under which the image folder is created. The
            value as given (minus a trailing `/`) is used as the prefix of
            the rewritten image links.
        working_dir: Directory to run the conversion in. If None, a
            temporary `tmp_*` directory is created in the current
            directory and removed afterwards.

    Raises:
        FileNotFoundError: If `jupyter` is not installed or the conversion
            did not produce the expected markdown file.
    """
    img_exts = ("png", "jpg", "jpeg")
    # Assumes an already populated notebook.
    assert str(md_path).endswith(".md")
    current_dir = os.getcwd()

    original_img_dir = str(img_dir)
    if original_img_dir.endswith("/"):
        original_img_dir = original_img_dir[:-1]

    # Resolve paths before changing directory below.
    nb_path = os.path.abspath(nb_path)
    nb_fname = os.path.basename(nb_path)
    md_name = Path(md_path).name[:-3]
    target_md = md_name + ".md"
    img_dir = Path(os.path.abspath(img_dir)) / md_name

    del_working_dir = working_dir is None
    if del_working_dir:
        # mkdtemp avoids name collisions, and avoids `random.randint` with
        # float bounds, which raises TypeError on Python 3.12+.
        working_dir = tempfile.mkdtemp(prefix="tmp_", dir=current_dir)
    else:
        os.makedirs(working_dir, exist_ok=True)
    print("Using working_dir:", working_dir)

    try:
        os.chdir(working_dir)
        try:
            shutil.copyfile(nb_path, nb_fname)
            os.makedirs(img_dir, exist_ok=True)
            # Argument list (no shell) so unusual file names are safe.
            subprocess.run(
                [
                    "jupyter",
                    "nbconvert",
                    "--to",
                    "markdown",
                    nb_fname,
                    "--output",
                    target_md,
                ],
                check=False,
            )
            tmp_img_dir = md_name + "_files"
            if os.path.exists(tmp_img_dir):
                for fname in os.listdir(tmp_img_dir):
                    if fname.endswith(img_exts):
                        src = Path(tmp_img_dir) / fname
                        target = Path(img_dir) / fname
                        print("copy", src, "to", target)
                        shutil.copyfile(src, target)
        finally:
            # Always restore the caller's working directory.
            os.chdir(current_dir)

        with open(Path(working_dir) / target_md, encoding="utf-8") as f:
            md_content = f.read()
        # Point image links at the final image folder instead of the
        # `<name>_files` folder created by nbconvert.
        for ext in img_exts:
            md_content = md_content.replace(
                "![" + ext + "](" + md_name + "_files",
                "![" + ext + "](" + original_img_dir + "/" + md_name,
            )
        md_content = _make_output_code_blocks(md_content)
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(md_content)
    finally:
        if del_working_dir:
            shutil.rmtree(working_dir)
def py_to_md(py_path, nb_path, md_path, img_dir, working_dir=None):
    """Convert a tutobook script to markdown via an intermediate notebook.

    The notebook is first generated at `nb_path` without executing it
    (`fill_outputs=False`), then rendered to `md_path` with `nb_to_md`.
    """
    py_to_nb(py_path=py_path, nb_path=nb_path, fill_outputs=False)
    nb_to_md(
        nb_path=nb_path,
        md_path=md_path,
        img_dir=img_dir,
        working_dir=working_dir,
    )
def validate(py):
    """Validate the format of a tutobook script.

    Specifically:
    - validate headers
    - reject single-line triple-quote-fenced comments and trailing spaces
    - validate style with black

    Args:
        py: Full text of the tutobook script.

    Raises:
        ValueError: On the first violation found. Line numbers in the
            messages are 1-based.
    """
    lines = py.split("\n")
    # Pad the header so that a script that is too short reports the first
    # missing field instead of failing with an IndexError.
    header = lines[:6] + [""] * (6 - len(lines))
    if not header[0].startswith('"""'):
        raise ValueError('Missing `"""`-fenced header at top of script.')
    if not header[1].startswith("Title: "):
        raise ValueError("Missing `Title:` field.")
    if not header[2].startswith(("Author: ", "Authors: ")):
        raise ValueError("Missing `Author:` field.")
    if not header[3].startswith("Date created: "):
        raise ValueError("Missing `Date created:` field.")
    if not header[4].startswith("Last modified: "):
        raise ValueError("Missing `Last modified:` field.")
    if not header[5].startswith("Description: "):
        raise ValueError("Missing `Description:` field.")
    description = header[5][len("Description: ") :]
    if not description:
        raise ValueError("Missing `Description:` field content.")
    if not description[0] == description[0].upper():
        raise ValueError("Description field content must be capitalized.")
    if not description[-1] == ".":
        raise ValueError("Description field content must end with a period.")
    if len(description) > 100:
        raise ValueError(
            "Description field content must be less than 100 chars."
        )
    # Report 1-based line numbers, matching what editors display.
    for i, line in enumerate(lines, start=1):
        if line.startswith('"""') and line.endswith('"""') and len(line) > 3:
            raise ValueError(
                'Do not use single line `"""`-fenced comments. '
                "Encountered at line %d" % (i,)
            )
    for i, line in enumerate(lines, start=1):
        if line.endswith(" "):
            raise ValueError(
                "Found trailing space on line %d; line: `%s`" % (i, line)
            )
    # Validate style with black: format a temporary copy and compare.
    # mkstemp gives a unique, portable path (no hard-coded /tmp, and no
    # `random.randint` with float bounds, a TypeError on Python 3.12+).
    fd, fpath = tempfile.mkstemp(suffix=".py")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(py)
        try:
            subprocess.run(["black", fpath], check=False)
        except FileNotFoundError:
            # As with the previous `os.system` call, a missing formatter
            # leaves the file untouched and the style check passes.
            print("`black` executable not found; skipping style check.")
        with open(fpath, encoding="utf-8") as f:
            formatted = f.read()
    finally:
        os.remove(fpath)
    if formatted != py:
        raise ValueError(
            "You python file did not follow `black` conventions. "
            "Run `black your_file.py` to autoformat it."
        )
def _count_locs(lines):
loc = 0
string_open = False
for line in lines:
line = line.strip()
if not line or line.startswith("#"):
continue
if not string_open:
if not line.startswith('"""'):
loc += 1
else:
if not line.endswith('"""'):
string_open = True
else:
if line.startswith('"""'):
string_open = False
return loc
def _shorten_lines(py):
max_len = 90
lines = []
for line in py.split("\n"):
if len(line) <= max_len:
lines.append(line)
continue
i = 0
while len(line) > max_len:
line = line.lstrip()
if " " not in line[1:]:
lines.append(line)
break
else:
short_line = line[:max_len]
line = line[max_len:]
if " " in short_line:
reversed_short_line = short_line[::-1]
index = reversed_short_line.find(" ") + 1
line = short_line[-index:] + line
short_line = short_line[:-index]
lines.append(short_line.lstrip())
i += 1
if i > 10:
raise
lines.append(line.lstrip())
return "\n".join(lines)
def _get_next_script_element(py):
lines = py.split("\n")
assert lines
elines = []
i = 0
tag = None
if lines[0].startswith('"""'):
assert len(lines) >= 2
etype = "markdown"
if len(lines[0]) > 3:
tag = lines[0][3:]
if tag not in ["shell", "invisible"]:
raise ValueError("Found unknown cell tag:", tag)
lines = lines[1:]
else:
etype = "code"
for i, line in enumerate(lines):
if line.startswith('"""'):
break
else:
elines.append(line)
if etype == "markdown":
py = "\n".join(lines[i + 1 :])
else:
py = "\n".join(lines[i:])
e = "\n".join(elines)
return e, etype, py, tag
def _parse_header(header):
lines = header.split("\n")
title = lines[0][len("Title: ") :]
author_line = lines[1]
if author_line.startswith("Authors"):
author = author_line[len("Authors: ") :]
auth_field = "Authors"
else:
author = author_line[len("Author: ") :]
auth_field = "Author"
date_created = lines[2][len("Date created: ") :]
last_modified = lines[3][len("Last modified: ") :]
description = lines[4][len("Description: ") :]
return {
"title": title,
"author": author,
"auth_field": auth_field,
"date_created": date_created,
"last_modified": last_modified,
"description": description,
}
def _make_output_code_blocks(md):
lines = md.split("\n")
output_lines = []
final_lines = []
is_inside_backticks = False
def is_output_line(line, prev_line, output_lines):
if line.startswith(" ") and len(line) >= 5:
if output_lines or (lines[i - 1].strip() == "" and line.strip()):
return True
return False
def flush(output_lines, final_lines):
final_lines.append('<div class="k-default-codeblock">')
final_lines.append("```")
if len(output_lines) == 1:
line = output_lines[0]
final_lines.append(line[4:])
else:
for line in output_lines:
final_lines.append(line[4:])
final_lines.append("```")
final_lines.append("</div>")
for i, line in enumerate(lines):
if line.startswith("```"):
is_inside_backticks = not is_inside_backticks
final_lines.append(line)
continue
if is_inside_backticks:
final_lines.append(line)
continue
if i > 0 and is_output_line(line, lines[-1], output_lines):
output_lines.append(line)
elif not line:
if output_lines:
if output_lines[-1]:
output_lines.append(line)
else:
final_lines.append(line)
else:
if output_lines:
flush(output_lines, final_lines)
output_lines = []
final_lines.append(line)
if output_lines:
flush(output_lines, final_lines)
return "\n".join(final_lines)
# Template for generated notebooks: notebook-level metadata (Colab settings,
# kernel spec, language info) and the nbformat version. `py_to_nb` starts
# from this template, then fills in `metadata.colab.name` and adds `cells`.
NB_BASE = {
    "metadata": {
        "colab": {
            "collapsed_sections": [],
            "name": "",  # FILL ME
            "private_outputs": False,
            "provenance": [],
            "toc_visible": True,
        },
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        },
        "language_info": {
            "codemirror_mode": {"name": "ipython", "version": 3},
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.7.0",
        },
    },
    "nbformat": 4,
    "nbformat_minor": 0,
}
# Command-line entry point:
#   nb2py source_filename.ipynb target_filename.py
#   py2nb source_filename.py target_filename.ipynb
if __name__ == "__main__":
    # Check the argument count first so that a missing command produces
    # the usage error instead of an IndexError.
    if len(sys.argv) < 2 or sys.argv[1] not in {"nb2py", "py2nb"}:
        raise ValueError(
            "Specify a command: either "
            "`nb2py source_filename.ipynb target_filename.py` or "
            "`py2nb source_filename.py target_filename.ipynb`"
        )
    cmd = sys.argv[1]
    if len(sys.argv) < 4:
        raise ValueError("Specify a source filename and a target filename")
    source = sys.argv[2]
    target = sys.argv[3]

    if cmd == "py2nb":
        if not source.endswith(".py"):
            raise ValueError(
                "The source filename should be a Python file. Got: " + source
            )
        if not target.endswith(".ipynb"):
            raise ValueError(
                "The target filename should be a notebook file. Got: "
                + target
            )
        py_to_nb(source, target)
    if cmd == "nb2py":
        if not source.endswith(".ipynb"):
            raise ValueError(
                "The source filename should be a notebook file. Got: "
                + source
            )
        if not target.endswith(".py"):
            raise ValueError(
                "The target filename should be a Python file. Got: " + target
            )
        nb_to_py(source, target)