# coursera/extractors.py
"""
This module contains implementations of extractors. Extractors know how
to parse the site of a MOOC platform and return a list of modules to
download. Usually they do not download heavy content, except when it is
necessary to parse the course syllabus.
"""
import abc
import json
import logging
from .api import (CourseraOnDemand, OnDemandCourseMaterialItemsV1,
ModulesV1, LessonsV1, ItemsV2)
from .define import OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2
from .network import get_page
from .utils import is_debug_run, spit_json
class PlatformExtractor(object):
    """
    Base class for platform extractors.

    Subclasses are expected to provide their own ``get_modules``.
    """

    # NOTE(review): ``__metaclass__`` is only honoured by Python 2; on
    # Python 3 it is an ordinary class attribute and does not make this
    # class an ABC. Kept as is so existing behaviour does not change.
    __metaclass__ = abc.ABCMeta

    def get_modules(self):
        """
        Get course modules.

        The base implementation does nothing.

        @return: Always None.
        """
        return None
class CourseraExtractor(PlatformExtractor):
    """
    Extractor for Coursera on-demand courses.

    Downloads the course materials listing through the given session and
    converts it into a nested list of modules, sections and lectures
    together with the links extracted for every lecture.
    """

    def __init__(self, session):
        """
        @param session: Session object that is handed over to ``get_page``
            and to the API helper classes.
        """
        # Becomes True after notebook content has been extracted once;
        # later notebook items are then skipped.
        self._notebook_downloaded = False
        self._session = session

    def list_courses(self):
        """
        List enrolled courses.

        @return: List of enrolled courses.
        @rtype: [str]
        """
        course = CourseraOnDemand(session=self._session,
                                  course_id=None,
                                  course_name=None)
        return course.list_courses()

    def get_modules(self, class_name,
                    reverse=False, unrestricted_filenames=False,
                    subtitle_language='en', video_resolution=None,
                    download_quizzes=False, mathjax_cdn_url=None,
                    download_notebooks=False):
        """
        Download the syllabus of an on-demand course and parse it.

        @param class_name: Course name used to build the syllabus URL.
        @param reverse: If True, the list of parsed modules is reversed
            (before the optional "Resources" entry is appended).
        @param unrestricted_filenames: Passed on to CourseraOnDemand.
        @param subtitle_language: Passed on to the lecture link extraction.
        @param video_resolution: Passed on to the lecture link extraction.
        @param download_quizzes: If True, links are also extracted for
            items of type 'quiz', 'exam' and 'programming'.
        @param mathjax_cdn_url: Passed on to CourseraOnDemand.
        @param download_notebooks: If True, links are also extracted for
            the first item of type 'notebook'.

        @return: Tuple of (bool, list), where bool indicates whether
            there was at least one error while parsing syllabus, the list
            is a list of parsed modules.
        @rtype: (bool, list)
        """
        page = self._get_on_demand_syllabus(class_name)
        error_occurred, modules = self._parse_on_demand_syllabus(
            class_name,
            page, reverse, unrestricted_filenames,
            subtitle_language, video_resolution,
            download_quizzes, mathjax_cdn_url, download_notebooks)
        return error_occurred, modules

    def _get_on_demand_syllabus(self, class_name):
        """
        Get the on-demand course listing webpage.

        @param class_name: Course name substituted into the materials URL.
        @return: Page content as returned by ``get_page``.
        """
        url = OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2.format(
            class_name=class_name)
        page = get_page(self._session, url)
        logging.debug('Downloaded %s (%d bytes)', url, len(page))

        return page

    def _parse_on_demand_syllabus(self, course_name, page, reverse=False,
                                  unrestricted_filenames=False,
                                  subtitle_language='en',
                                  video_resolution=None,
                                  download_quizzes=False,
                                  mathjax_cdn_url=None,
                                  download_notebooks=False
                                  ):
        """
        Parse a Coursera on-demand course listing/syllabus page.

        @param course_name: Course name; used for the API helpers and as
            a prefix of the debug dump file names.
        @param page: JSON text of the course materials listing.

        The remaining parameters have the same meaning as in
        ``get_modules``.

        @return: Tuple of (bool, list), where bool indicates whether
            there was at least one error while parsing syllabus, the list
            is a list of parsed modules. Every module is a tuple
            (module slug, [(section slug, [(lecture slug, links)])]).
            If any resources yield links, a final
            ("Resources", [(resource slug, [('', links)])]) entry is
            appended.
        @rtype: (bool, list)
        """
        dom = json.loads(page)
        class_id = dom['elements'][0]['id']

        logging.info('Parsing syllabus of on-demand course (id=%s). '
                     'This may take some time, please be patient ...',
                     class_id)
        linked = dom['linked']
        json_items = linked['onDemandCourseMaterialItems.v2']

        course = CourseraOnDemand(
            session=self._session, course_id=class_id,
            course_name=course_name,
            unrestricted_filenames=unrestricted_filenames,
            mathjax_cdn_url=mathjax_cdn_url)
        course.obtain_user_id()
        ondemand_material_items = OnDemandCourseMaterialItemsV1.create(
            session=self._session, course_name=course_name)

        if is_debug_run():
            spit_json(dom, '%s-syllabus-raw.json' % course_name)
            spit_json(json_items, '%s-material-items-v2.json' % course_name)
            spit_json(ondemand_material_items._items,
                      '%s-course-material-items.json' % course_name)

        all_modules = ModulesV1.from_json(
            linked['onDemandCourseMaterialModules.v1'])
        all_lessons = LessonsV1.from_json(
            linked['onDemandCourseMaterialLessons.v1'])
        all_items = ItemsV2.from_json(json_items)

        error_occurred = False
        modules = []

        for module in all_modules:
            logging.info('Processing module  %s', module.slug)
            lessons = []
            for section in module.children(all_lessons):
                logging.info('Processing section     %s', section.slug)

                available_lectures = section.children(all_items)

                # Certain modules may be empty-looking programming
                # assignments e.g. in data-structures,
                # algorithms-on-graphs ondemand courses
                if not available_lectures:
                    lecture = ondemand_material_items.get(section.id)
                    if lecture is not None:
                        available_lectures = [lecture]

                lectures = []
                for lecture in available_lectures:
                    links = self._extract_lecture_links(
                        course, class_id, lecture,
                        subtitle_language, video_resolution,
                        download_quizzes, download_notebooks)

                    # Empty dictionary means there were no data
                    # None means an error occurred
                    if links is None:
                        error_occurred = True
                    elif links:
                        lectures.append((lecture.slug, links))

                if lectures:
                    lessons.append((section.slug, lectures))

            if lessons:
                modules.append((module.slug, lessons))

        if modules and reverse:
            modules.reverse()

        # Processing resources section
        references_failed, references = self._extract_references(course)
        if references_failed:
            error_occurred = True
        if references:
            modules.append(("Resources", references))

        return error_occurred, modules

    def _extract_lecture_links(self, course, class_id, lecture,
                               subtitle_language, video_resolution,
                               download_quizzes, download_notebooks):
        """
        Extract links for a single syllabus item, based on its type name.

        @param course: CourseraOnDemand instance used for the extraction.
        @param class_id: Course id, needed for items of type 'lecture'.
        @param lecture: Syllabus item; its ``type_name``, ``slug`` and
            ``id`` attributes are read.

        @return: Whatever the matching ``course.extract_links_from_*``
            call returns (the caller treats None as an error), or an
            empty dictionary if the item is skipped: its type is
            unsupported, or it is disabled by ``download_quizzes`` /
            ``download_notebooks``.
        """
        typename = lecture.type_name

        logging.info('Processing lecture         %s (%s)',
                     lecture.slug, typename)

        if typename == 'lecture':
            return course.extract_links_from_lecture(
                class_id,
                lecture.id, subtitle_language,
                video_resolution)

        if typename == 'supplement':
            return course.extract_links_from_supplement(lecture.id)

        if typename == 'phasedPeer':
            return course.extract_links_from_peer_assignment(lecture.id)

        if typename in ('gradedProgramming', 'ungradedProgramming'):
            return course.extract_links_from_programming(lecture.id)

        if typename == 'quiz':
            if download_quizzes:
                return course.extract_links_from_quiz(lecture.id)
            return {}

        if typename == 'exam':
            if download_quizzes:
                return course.extract_links_from_exam(lecture.id)
            return {}

        if typename == 'programming':
            if download_quizzes:
                return (
                    course
                    .extract_links_from_programming_immediate_instructions(
                        lecture.id))
            return {}

        if typename == 'notebook':
            if download_notebooks and not self._notebook_downloaded:
                logging.warning(
                    'According to notebooks platform, content will be downloaded first')
                links = course.extract_links_from_notebook(lecture.id)
                # Set only after the extraction call has returned, so an
                # exception raised by it leaves the flag untouched.
                self._notebook_downloaded = True
                return links
            return {}

        logging.info(
            'Unsupported typename "%s" in lecture "%s" (lecture id "%s")',
            typename, lecture.slug, lecture.id)
        return {}

    def _extract_references(self, course):
        """
        Extract links for the entries of the course resources section.

        @param course: CourseraOnDemand instance used for the extraction.

        @return: Tuple of (bool, list), where bool tells whether link
            extraction returned None for at least one resource, and the
            list contains (resource slug, [('', links)]) for every
            resource that produced links.
        @rtype: (bool, list)
        """
        error_occurred = False
        references = []

        json_references = course.extract_references_poll()
        if not json_references:
            return error_occurred, references

        logging.info('Processing resources')
        for json_reference in json_references:
            reference_slug = json_reference['slug']
            logging.info('Processing resource  %s',
                         reference_slug)

            links = course.extract_links_from_reference(
                json_reference['shortId'])
            if links is None:
                error_occurred = True
            elif links:
                references.append((reference_slug, [('', links)]))

        return error_occurred, references