coursera/workflow.py
import os
import re
import abc
import time
import codecs
import logging
import subprocess
import requests
from .formatting import format_section, get_lecture_filename
from .playlist import create_m3u_playlist
from .utils import is_course_complete, mkdir_p, normalize_path
from .filtering import find_resources_to_get, skip_format_url
from .define import IN_MEMORY_MARKER
def _iter_modules(modules, class_name, path, ignored_formats, args):
"""
This huge function generates a hierarchy with hopefully more
clear structure of modules/sections/lectures.
"""
file_formats = args.file_formats
lecture_filter = args.lecture_filter
resource_filter = args.resource_filter
section_filter = args.section_filter
verbose_dirs = args.verbose_dirs
combined_section_lectures_nums = args.combined_section_lectures_nums
class IterModule(object):
def __init__(self, index, module):
self.index = index
self.name = '%02d_%s' % (index + 1, module[0])
self._module = module
@property
def sections(self):
sections = self._module[1]
for (secnum, (section, lectures)) in enumerate(sections):
if section_filter and not re.search(section_filter, section):
logging.debug('Skipping b/c of sf: %s %s',
section_filter, section)
continue
yield IterSection(self, secnum, section, lectures)
class IterSection(object):
def __init__(self, module_iter, secnum, section, lectures):
self.index = secnum
self.name = '%02d_%s' % (secnum, section)
self.dir = os.path.join(
path, class_name, module_iter.name,
format_section(secnum + 1, section,
class_name, verbose_dirs))
self._lectures = lectures
@property
def lectures(self):
for (lecnum, (lecname, lecture)) in enumerate(self._lectures):
if lecture_filter and not re.search(lecture_filter, lecname):
logging.debug('Skipping b/c of lf: %s %s',
lecture_filter, lecname)
continue
yield IterLecture(self, lecnum, lecname, lecture)
class IterLecture(object):
def __init__(self, section_iter, lecnum, lecname, lecture):
self.index = lecnum
self.name = lecname
self._lecture = lecture
self._section_iter = section_iter
def filename(self, fmt, title):
lecture_filename = get_lecture_filename(
combined_section_lectures_nums,
self._section_iter.dir, self._section_iter.index,
self.index, self.name, title, fmt)
return lecture_filename
@property
def resources(self):
resources_to_get = find_resources_to_get(
self._lecture, file_formats, resource_filter,
ignored_formats)
for fmt, url, title in resources_to_get:
yield IterResource(fmt, url, title)
class IterResource(object):
def __init__(self, fmt, url, title):
self.fmt = fmt
self.url = url
self.title = title
for index, module in enumerate(modules):
yield IterModule(index, module)
def _walk_modules(modules, class_name, path, ignored_formats, args):
"""
Helper generator that traverses modules in returns a flattened
iterator.
"""
for module in _iter_modules(modules=modules,
class_name=class_name,
path=path,
ignored_formats=ignored_formats,
args=args):
for section in module.sections:
for lecture in section.lectures:
for resource in lecture.resources:
yield module, section, lecture, resource
class CourseDownloader(object):
__metaclass__ = abc.ABCMeta
def __init__(self):
pass
@abc.abstractmethod
def download_modules(self, modules):
pass
class CourseraDownloader(CourseDownloader):
def __init__(self,
downloader,
commandline_args,
class_name,
path='',
ignored_formats=None,
disable_url_skipping=False):
super(CourseraDownloader, self).__init__()
self._downloader = downloader
self._args = commandline_args
self._class_name = class_name
self._path = path
self._ignored_formats = ignored_formats
self._disable_url_skipping = disable_url_skipping
self.skipped_urls = None if disable_url_skipping else []
self.failed_urls = []
def download_modules(self, modules):
completed = True
modules = _iter_modules(
modules, self._class_name, self._path,
self._ignored_formats, self._args)
for module in modules:
last_update = -1
for section in module.sections:
if not os.path.exists(section.dir):
mkdir_p(normalize_path(section.dir))
for lecture in section.lectures:
for resource in lecture.resources:
lecture_filename = normalize_path(
lecture.filename(resource.fmt, resource.title))
last_update = self._handle_resource(
resource.url, resource.fmt, lecture_filename,
self._download_completion_handler, last_update)
# After fetching resources, create a playlist in M3U format with the
# videos downloaded.
if self._args.playlist:
create_m3u_playlist(section.dir)
if self._args.hooks:
self._run_hooks(section, self._args.hooks)
# if we haven't updated any files in 1 month, we're probably
# done with this course
completed = completed and is_course_complete(last_update)
if completed:
logging.info('COURSE PROBABLY COMPLETE: ' + self._class_name)
# Wait for all downloads to complete
self._downloader.join()
return completed
def _download_completion_handler(self, url, result):
if isinstance(result, requests.exceptions.RequestException):
logging.error('The following error has occurred while '
'downloading URL %s: %s', url, str(result))
self.failed_urls.append(url)
elif isinstance(result, Exception):
logging.error('Unknown exception occurred: %s', result)
self.failed_urls.append(url)
def _handle_resource(self, url, fmt, lecture_filename, callback, last_update):
"""
Handle resource. This function builds up resource file name and
downloads it if necessary.
@param url: URL of the resource.
@type url: str
@param fmt: Format of the resource (pdf, csv, etc)
@type fmt: str
@param lecture_filename: File name of the lecture.
@type lecture_filename: str
@param callback: Callback that will be called when file has been
downloaded. It will be called even if exception occurred.
@type callback: callable(url, result) where result may be Exception
@param last_update: Timestamp of the newest file so far.
@type last_update: int
@return: Updated latest mtime.
@rtype: int
"""
overwrite = self._args.overwrite
resume = self._args.resume
skip_download = self._args.skip_download
# Decide whether we need to download it
if overwrite or not os.path.exists(lecture_filename) or resume:
if not skip_download:
if url.startswith(IN_MEMORY_MARKER):
page_content = url[len(IN_MEMORY_MARKER):]
logging.info('Saving page contents to: %s', lecture_filename)
with codecs.open(lecture_filename, 'w', 'utf-8') as file_object:
file_object.write(page_content)
else:
if self.skipped_urls is not None and skip_format_url(fmt, url):
self.skipped_urls.append(url)
else:
logging.info('Downloading: %s', lecture_filename)
self._downloader.download(callback, url, lecture_filename, resume=resume)
else:
open(lecture_filename, 'w').close() # touch
last_update = time.time()
else:
logging.info('%s already downloaded', lecture_filename)
# if this file hasn't been modified in a long time,
# record that time
last_update = max(last_update,
os.path.getmtime(lecture_filename))
return last_update
def _run_hooks(self, section, hooks):
original_dir = os.getcwd()
for hook in hooks:
logging.info('Running hook %s for section %s.',
hook, section.dir)
os.chdir(section.dir)
subprocess.call(hook)
os.chdir(original_dir)