coursera-dl/edx-dl

View on GitHub
test_edx_dl.py

Summary

Maintainability
A
0 mins
Test Coverage
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pytest
from edx_dl import edx_dl, parsing
from edx_dl.common import Unit, Video, DEFAULT_FILE_FORMATS


def test_failed_login():
    """
    Make sure that a login attempt with the guest/guest credentials is not
    reported as successful.
    """
    headers = edx_dl.edx_get_headers()
    response = edx_dl.edx_login(edx_dl.LOGIN_API, headers, "guest", "guest")

    # A response without a 'success' key counts as a failed login as well.
    login_succeeded = response.get('success', False)
    assert not login_succeeded


def test_remove_repeated_urls():
    """
    Make sure that remove_repeated_urls reduces the number of urls
    extracted from the multiple_units.html page from 18 to 16.
    """
    fixture_path = "test/html/multiple_units.html"
    base_url = 'https://courses.edx.org'

    # Only the read needs the file handle; everything else works on the text.
    with open(fixture_path, "r") as fixture:
        page_html = fixture.read()

    extractor = parsing.CurrentEdXPageExtractor()
    extracted = extractor.extract_units_from_html(
        page_html, base_url, DEFAULT_FILE_FORMATS)

    # The fixture path doubles as the key under which the units are stored.
    units_by_key = {fixture_path: extracted}
    deduplicated = edx_dl.remove_repeated_urls(units_by_key)

    count_before = edx_dl.num_urls_in_units_dict(units_by_key)
    count_after = edx_dl.num_urls_in_units_dict(deduplicated)

    assert count_before == 18
    assert count_after == 16
    assert count_before != count_after


@pytest.fixture
def all_units():
    """
    Units dict with one empty section and one section holding three units:
    a unit without videos, a unit whose only video has no mp4 urls, and a
    unit whose video has two mp4 urls plus one resource url.
    """
    def video_with(mp4_urls):
        # Every Video in this fixture differs only in its mp4 urls.
        return Video(video_youtube_url=None,
                     available_subs_url=None,
                     sub_template_url=None,
                     mp4_urls=mp4_urls)

    unit_without_videos = Unit(videos=[], resources_urls=[])
    unit_without_urls = Unit(videos=[video_with([])], resources_urls=[])
    unit_with_urls = Unit(videos=[video_with(['1', '2'])],
                          resources_urls=['3'])

    return {
        'empty_section': [],
        'nonempty_section': [
            unit_without_videos,
            unit_without_urls,
            unit_with_urls,
        ],
    }


@pytest.fixture
def unknown_units():
    """
    Units dict whose only section holds a plain string instead of a Unit.
    """
    not_a_unit = 'shouldfail'
    sections = {}
    sections['nonempty_section'] = [not_a_unit]
    return sections


@pytest.fixture
def unknown_videos():
    """
    Units dict whose only Unit lists a plain string instead of a Video.
    """
    not_a_video = 'shoudfail'
    broken_unit = Unit(videos=[not_a_video], resources_urls=['3'])
    sections = {}
    sections['nonempty_section'] = [broken_unit]
    return sections


def test_extract_urls_from_units(all_units):
    """
    Make sure that the extracted urls come from both the mp4_urls of the
    videos and the resources_urls of the Unit class.
    """
    url_format = '%(url)s'
    wanted = ['1\n', '2\n', '3\n']

    extracted = edx_dl.extract_urls_from_units(all_units, url_format)

    # Compare order-independently: both sides are sorted first.
    assert sorted(extracted) == sorted(wanted)


def test_extract_urls_from_units_unknown_units(unknown_units):
    """
    Make sure that a TypeError is raised when the list of units contains
    something other than a Unit.
    """
    url_format = '%(url)s'
    with pytest.raises(TypeError):
        edx_dl.extract_urls_from_units(unknown_units, url_format)


def test_extract_urls_from_units_unknown_videos(unknown_videos):
    """
    Make sure that a TypeError is raised when the videos of a Unit contain
    something other than a Video.
    """
    url_format = '%(url)s'
    with pytest.raises(TypeError):
        edx_dl.extract_urls_from_units(unknown_videos, url_format)


def test_edx_get_subtitle():
    """
    Make sure Stanford subtitle URLs are distinguished from EdX ones.

    Both page fetchers are passed in as local stubs that check the
    arguments they receive. For the Stanford (lagunita) URL the result must
    be exactly what the plain page fetcher returns; for the other URL the
    result must be the SRT text built from the JSON fetcher's payload.
    """

    def mock_get_page_contents(u, h):
        # `url` and `headers` are resolved when the stub is called, so these
        # checks follow the rebinding of `url` further down.
        assert u == url
        assert h == headers
        return u

    def mock_get_page_contents_as_json(u, h):
        assert u == url
        assert h == headers
        return {'start': [123], 'end': [456], 'text': ["subtitle content"]}

    url = "https://lagunita.stanford.edu/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_7f4f16e3eb294538aa8db4c43877132b/handler/transcript/download"
    headers = {}

    # The plain fetcher stub echoes the URL back, so that is the expectation.
    expected = url
    actual = edx_dl.edx_get_subtitle(url, headers, mock_get_page_contents, mock_get_page_contents_as_json)
    assert expected == actual

    # Make sure Non-Stanford URLs still work
    url = "https://www.edx.org/could/be/more/realistic"

    expected = '0\n00:00:00,123 --> 00:00:00,456\nsubtitle content\n\n'
    actual = edx_dl.edx_get_subtitle(url, headers, mock_get_page_contents, mock_get_page_contents_as_json)
    assert expected == actual


def test_extract_subtitle_urls():
    """
    Make sure that the relative transcript download link of a video block
    is returned joined to the base url, with None as the first element of
    the returned pair.
    """
    html_snippet = """
<li class="video-tracks video-download-button">
            <a href="/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_1a4c7ff41e484a15927987b745a5c779/handler/transcript/download">Download transcript</a>
            <div class="a11y-menu-container">
                <a class="a11y-menu-button" href="#" title=".srt" role="button" aria-disabled="false">.srt</a>
                <ol class="a11y-menu-list" role="menu">
                  <li class="a11y-menu-item active">

                      <a class="a11y-menu-item-link" href="#srt" title="SubRip (.srt) file" data-value="srt" role="menuitem" aria-disabled="false">
                        SubRip (.srt) file
                      </a>
                  </li>
                  <li class="a11y-menu-item">

                      <a class="a11y-menu-item-link" href="#txt" title="Text (.txt) file" data-value="txt" role="menuitem" aria-disabled="false">
                        Text (.txt) file
                      </a>
                  </li>
                </ol>
            </div>
        </li>
    """

    wanted = (None, 'https://base.url/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_1a4c7ff41e484a15927987b745a5c779/handler/transcript/download')

    extractor = parsing.CurrentEdXPageExtractor()
    found = extractor.extract_subtitle_urls(html_snippet, "https://base.url")

    # Echo the extracted pair so it is visible in the captured test output.
    print("actual", found)
    assert wanted == found