comic_abstract.py
#! /usr/bin/python3
# vim: set expandtab tabstop=4 shiftwidth=4 :
"""Module to define logic common to all comics."""
import json
import time
import os
from datetime import date
from urlfunctions import get_filename_from_url, get_file_at_url
import inspect
import logging
def get_date_for_comic(comic):
    """Build the `date` object described by a comic entry.

    The entry is read through its "year", "month" and "day" keys (in that
    order); a missing key raises KeyError."""
    year, month, day = comic["year"], comic["month"], comic["day"]
    return date(year, month, day)
def get_info_before_comic(comic):
    """Yield the pieces of text to display before the images of a comic.

    Only the "author" entry is used: when it is present and truthy, a
    single "by <author>" string is produced; otherwise nothing is."""
    author = comic.get("author")
    if not author:
        return
    yield "by " + author
def get_info_after_comic(comic):
    """Yield the pieces of information to display after the images.

    The entries "alt", "title", "title2", "texts", "name" and "description"
    are looked up in that order; missing or falsy values are skipped and
    the others are yielded unchanged."""
    fields = ("alt", "title", "title2", "texts", "name", "description")
    # filter(None, ...) keeps truthy values only; everything stays lazy.
    yield from filter(None, map(comic.get, fields))
class GenericComic(object):
    """Generic class to handle the logic common to all comics.

    The class is never instantiated: every operation is a class method and
    concrete comics are expected to subclass it, set the attributes below
    and implement get_next_comic().

    The database is a JSON file holding a list of dictionaries (one per
    comic) stored in the output directory next to the downloaded images.
    The mere presence of the "new" / "deleted" keys in a comic is used as a
    flag (their value is always None).

    Attributes :
        name        Name of the comic (for logging, CLI and default output dir)
        long_name   Long name of the comic (to be added in the comic info)
        url         Base url for the comic (without trailing slash)
        _categories Iterable of category names owned by the class; see
                    get_categories() for how ancestors are merged in."""

    name = None
    long_name = None
    url = None
    _categories = ("ALL",)

    @classmethod
    def log(cls, string):
        """Dirty logging function.

        Emits a debug record of the form "<caller> <comic name> <string>"
        where <caller> is the name of the function calling log()."""
        # TODO: https://docs.python.org/2/library/logging.html#logrecord-attributes
        # we do not need to retrieve the function name manually
        # context=0: only the function name is needed, so do not make
        # inspect load source lines for every frame of the stack.
        caller = inspect.stack(0)[1][3]
        # Lazy %-formatting: also works when `name` is still None (as on
        # GenericComic itself), where string concatenation used to raise.
        logging.debug("%s %s %s", caller, cls.name, string)

    @classmethod
    def _get_output_dir(cls):
        """Returns the name of the output directory (for comics and JSON file).

        To be overridden if needed."""
        return cls.name

    @classmethod
    def _create_output_dir(cls):
        """Create output directory for the comic on the file system.

        Nothing happens if the directory already exists."""
        cls.log("start")
        os.makedirs(cls._get_output_dir(), exist_ok=True)
        cls.log("done")

    @classmethod
    def _get_json_file_path(cls):
        """Get the full path to the JSON file ("<output dir>/<name>.json")."""
        return os.path.join(cls._get_output_dir(), cls.name + ".json")

    @classmethod
    def _load_db(cls):
        """Load the JSON file to return the list of comics.

        Deleted comics are included; see get_comics() to filter them out."""
        return cls._load_db_from_file(cls._get_json_file_path())

    @classmethod
    def _load_db_from_file(cls, filepath):
        """Load the JSON file at `filepath` to return the list of comics.

        Returns an empty list when the file cannot be opened or read
        (typically because nothing has been saved yet)."""
        cls.log("start")
        try:
            # The file is written with ensure_ascii, so utf-8 always works
            # and does not depend on the locale.
            with open(filepath, encoding="utf-8") as file:
                return json.load(file)
        except IOError:
            return []

    @classmethod
    def get_comics(cls):
        """Return the list of (non-deleted) comics from the database."""
        return [c for c in cls._load_db() if "deleted" not in c]

    @classmethod
    def get_last_comic(cls, comics):
        """Return the last (non-deleted) comic of `comics`, None if there is none."""
        return next((c for c in reversed(comics) if "deleted" not in c), None)

    @classmethod
    def _save_db(cls, data):
        """Save the list of comics in the JSON file."""
        return cls._save_db_in_file(data, cls._get_json_file_path())

    @classmethod
    def _save_db_in_file(cls, data, filepath):
        """Save the list of comics `data` in the JSON file at `filepath`.

        If a KeyboardInterrupt occurs while writing, the file is rewritten
        from scratch before the interruption is re-raised so that the
        database on disk is complete."""
        cls.log("start")
        with open(filepath, "w+", encoding="utf-8") as file:
            try:
                json.dump(data, file, indent=4, sort_keys=True)
            except KeyboardInterrupt as e:
                print("Caught exception %s - will finish saving the DB first" % e)
                # Drop whatever was partially written: dumping again right
                # after it would produce an invalid JSON document.
                file.seek(0)
                file.truncate()
                json.dump(data, file, indent=4, sort_keys=True)
                raise
        cls.log("done")

    @classmethod
    def get_file_in_output_dir(cls, url, prefix=None, referer=None):
        """Download file from URL and save it in output folder.

        The local file name is `prefix` (if any) followed by the file name
        extracted from the url. Returns the result of get_file_at_url();
        the rest of this class treats None as a failed download."""
        cls.log("start (url:%s)" % url)
        basename = ("" if prefix is None else prefix) + get_filename_from_url(url)
        filename = os.path.join(cls._get_output_dir(), basename)
        return get_file_at_url(url, filename, referer)

    @classmethod
    def check_everything_is_ok(cls):
        """Perform tests on the database to check that everything is ok.

        Every non-deleted comic is checked with assertions (an
        AssertionError is raised on the first inconsistency):
         - 'url' is a string and 'comic' is the long name of the class
         - 'day', 'month', 'year' are integers forming a date not in the future
         - comics are sorted (by date or by 'num')
         - 'img' and 'local_img' are lists of the same length and each
            non-None local path is an existing file."""
        cls.log("start")
        print(cls.name, ": about to check")
        comics = cls.get_comics()  # cls._load_db()
        # Maps from local path / image url to the indices of the comics using it
        imgs_paths = {}
        imgs_urls = {}
        prev_date, prev_num = None, None
        today = date.today()
        for i, comic in enumerate(comics):
            cls.print_comic(comic, i)
            url = comic.get("url")
            assert isinstance(url, str), "Url %s not a string" % url
            assert comic.get("comic") == cls.long_name
            assert all(
                isinstance(comic.get(k), int) for k in ["day", "month", "year"]
            ), ("Invalid date data (%s)" % url)
            curr_date = get_date_for_comic(comic)
            assert curr_date <= today
            curr_num = comic.get("num", 0)
            assert isinstance(curr_num, int)
            # Either the dates or the numbers must be increasing
            assert prev_date is None or prev_date <= curr_date or prev_num < curr_num, (
                "Comics are not in order (%s)" % url
            )
            prev_date, prev_num = curr_date, curr_num
            img = comic.get("img")
            local_img = comic.get("local_img")
            assert isinstance(img, list)
            assert isinstance(local_img, list)
            assert len(local_img) == len(img)
            for path in local_img:
                # None stands for an image that could not be downloaded
                if path is not None:
                    assert os.path.isfile(path)
                    imgs_paths.setdefault(path, set()).add(i)
            for img_url in img:
                imgs_urls.setdefault(img_url, set()).add(i)
        print()
        # The two blocks below are manual debugging switches, disabled by default
        if False:  # To check if imgs are not overriding themselves
            for path, nums in imgs_paths.items():
                if len(nums) > 1:
                    print("Image used multiple times", path, nums)
            for img_url, nums in imgs_urls.items():
                if len(nums) > 1:
                    print("Url used multiple times", img_url, nums)
        if False:  # To check that all files in folder are useful
            # Not called `json` to avoid shadowing the module
            json_path = cls._get_json_file_path()
            output_dir = cls._get_output_dir()
            for file_ in os.listdir(output_dir):
                file_path = os.path.join(output_dir, file_)
                if file_path not in imgs_paths and file_path != json_path:
                    print("Unused image", file_path)
        cls.log("done")

    @classmethod
    def get_next_comic(cls, _):
        """Generator to get the next comic.

        First argument is the last properly downloaded comic which gives
        a starting point to download more (None if there is no such comic).

        This is the method called by update(). It should yield comics which
        are basically dictionaries with the following property :
            - 'url' is linked to a string
            - 'img' is linked to a list of url (that will get downloaded)
            - 'date' is an optional date object (with 'day', 'month' and
                'year' attribute) - if none is provided, today is used.
            - more fields can be provided.
        The keys 'day', 'month', 'year', 'local_img', 'comic' and 'new' are
        filled by update() and must not be provided."""
        raise NotImplementedError

    @classmethod
    def print_text(cls, text):
        """Print text by returning to the beginning of the line every time."""
        print(cls.name, ":", text, " " * 10, "\r", end="")

    @classmethod
    def print_comic(cls, comic, index=None):
        """Print information about a comic (its url, after `index` if provided)."""
        s = comic["url"]
        if index is None:
            cls.print_text(s)
        else:
            cls.print_text("%d %s" % (index, s))

    @classmethod
    def update(cls, saving_freq=100):
        """Update the database : get the latest comics and save in the DB.

        This is a wrapper around get_next_comic() providing the following
        generic features :
            - logging
            - database handling (open and save)
            - exception handling (properly retrieved data are always saved)
            - file download
            - data management (adds current date if no date is provided).

        `saving_freq` is the number of new comics between two intermediate
        saves of the database; a falsy value (0, None) disables them."""
        cls.log("start")
        # print(cls.name, ': about to update')
        cls._create_output_dir()
        comics = cls._load_db()
        new_comics = []
        start = time.time()
        try:
            last_comic = cls.get_last_comic(comics)
            cls.log(
                "last comic is %s"
                % ("None" if last_comic is None else last_comic["url"])
            )
            for i, comic in enumerate(cls.get_next_comic(last_comic), 1):
                cls.log("got %s" % str(comic))
                assert "url" in comic
                assert "img" in comic
                assert "day" not in comic
                assert "month" not in comic
                assert "year" not in comic
                # The date object is replaced by 3 JSON-friendly integers
                date_ = comic.pop("date", date.today())
                comic["day"], comic["month"], comic["year"] = (
                    date_.day,
                    date_.month,
                    date_.year,
                )
                prefix = comic.get("prefix", "")
                assert "local_img" not in comic
                comic["local_img"] = [
                    cls.get_file_in_output_dir(img_url, prefix, referer=comic["url"])
                    for img_url in comic["img"]
                ]
                assert "comic" not in comic
                comic["comic"] = cls.long_name
                assert "new" not in comic
                comic["new"] = None  # "'new' in comic" to check if new
                new_comics.append(comic)
                cls.print_comic(comic, i)
                if saving_freq and i % saving_freq == 0:
                    end = time.time()
                    delta = end - start
                    print(
                        cls.name,
                        ": got",
                        i,
                        "comics in",
                        delta,
                        "seconds so far - saving just in case",
                    )
                    cls._save_db(comics + new_comics)
        finally:
            # Reached on success and on error: what was retrieved is kept
            if new_comics:
                end = time.time()
                new_len, delta = len(new_comics), end - start
                print(cls.name, ": got", new_len, "comics in", delta, "seconds")
                cls._save_db(comics + new_comics)
                print(cls.name, ": added", new_len, "comics in", delta, "seconds")
            else:
                print(cls.name, ": nothing new")
        cls.log("done")

    @classmethod
    def try_to_get_missing_resources(cls):
        """Download images that might not have been downloaded properly in
        the first place.

        For each None entry in the 'local_img' of a comic, the matching url
        from 'img' is downloaded again. On success, the path is stored, the
        comic is flagged as 'new' and the database is eventually saved."""
        cls.log("start")
        print(cls.name, ": about to try to get missing resources")
        cls._create_output_dir()
        comics = cls._load_db()
        change = False
        for comic in comics:
            comicurl = comic["url"]
            local = comic["local_img"]
            prefix = comic.get("prefix", "")
            for i, (path, url) in enumerate(zip(local, comic["img"])):
                if path is None:
                    new_path = cls.get_file_in_output_dir(url, prefix, referer=comicurl)
                    if new_path is None:
                        print(cls.name, ": failed to get", url)
                    else:
                        print(cls.name, ": got", url, "at", new_path)
                        local[i] = new_path
                        change = True
                        comic["new"] = None
        if change:
            cls._save_db(comics)
            print(cls.name, ": some missing resources have been downloaded")
        cls.log("done")

    @classmethod
    def reset_new(cls):
        """Remove the 'new' flag on comics in the DB."""
        cls.log("start")
        cls._create_output_dir()
        cls._save_db(
            [
                {key: val for key, val in c.items() if key != "new"}
                for c in cls._load_db()
            ]
        )
        cls.log("done")

    @classmethod
    def delete_last(cls):
        """Delete last (non-deleted) comic.

        The comic is kept in the database and only flagged as 'deleted'."""
        cls.log("start")
        comics = cls._load_db()
        last_comic = cls.get_last_comic(comics)
        if last_comic is None:
            cls.log("no comic to delete")
        else:
            cls.log("about to delete %s" % last_comic["url"])
            last_comic["deleted"] = None  # "'deleted' in comic" to check if deleted
            cls._save_db(comics)
        cls.log("done")

    @classmethod
    def delete_all(cls):
        """Delete all comics.

        The comics are kept in the database and only flagged as 'deleted'."""
        cls.log("start")
        comics = cls._load_db()
        if comics:
            for c in comics:
                c["deleted"] = None  # "'deleted' in comic" to check if deleted
            cls._save_db(comics)
        cls.log("done")

    @classmethod
    def print_name(cls):
        """Print name."""
        cls.log("start")
        print(cls.name)
        cls.log("end")

    @classmethod
    def info(cls):
        """Print information about the comics.

        This covers names, categories, class hierarchy and statistics about
        the non-deleted comics of the database (count, images, date range)."""
        cls.log("start")
        print("%s (%s) : " % (cls.long_name, cls.url))
        print("In " + ", ".join(cls.get_categories()))
        print("Implemented via " + ", ".join((c.__name__ for c in cls.__mro__)))
        cls._create_output_dir()
        comics = cls.get_comics()  # cls._load_db()
        dates = [get_date_for_comic(c) for c in comics]
        print(
            "%d comics (%d new)" % (len(comics), sum(1 for c in comics if "new" in c))
        )
        print("%d images" % sum(len(c["img"]) for c in comics))
        if dates:
            date_min, date_max = min(dates), max(dates)
            print(
                "from %s to %s (%d days)"
                % (date_min, date_max, (date_max - date_min).days)
            )
        print()
        cls.log("done")

    @classmethod
    def readme(cls):
        """Return information to generate README (a markdown bullet with a link)."""
        return " * [%s](%s)\n" % (cls.long_name, cls.url)

    @classmethod
    def gitignore(cls):
        """Return information to generate gitignore (the name on its own line)."""
        return "%s\n" % (cls.name)

    @classmethod
    def get_categories(cls):
        """Return categories to be able to group comics.

        Categories are such that all classes have their ancestors'
        categories and their own (provided as an iterable in the
        `_categories` class member). The result is a sorted list without
        duplicates."""
        return sorted(
            {
                cat
                for klass in cls.__mro__
                for cat in getattr(klass, "_categories", [])
            }
        )