video2commons/backend/download/__init__.py from Toollabs/video2commons

video2commons/backend/download/__init__.py
Summary

Maintainability

3 hrs
Test Coverage

Issues
#! /usr/bin/python
# -*- coding: UTF-8 -*-
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>
#

"""Wrapper around youtube-dl."""



import os
from urllib.parse import urlparse

from celery.utils.log import get_logger
import yt_dlp
from yt_dlp.utils import std_headers, DownloadError

from video2commons.exceptions import TaskError


def download(
    url, ie_key, formats, subtitles, outputdir,
    statuscallback=None, errorcallback=None
):
    """Download a video from url to outputdir."""

    if url.startswith('uploads:'):
        # FIXME; this should be a configuration variable
        url = url.replace('uploads:', 'https://video2commons.toolforge.org/'
                                      'static/uploads/', 1)
        ie_key = None

    url_blacklisted(url)

    outputdir = os.path.abspath(outputdir)
    statuscallback = statuscallback or (lambda text, percent: None)
    errorcallback = errorcallback or (lambda text: None)
    outtmpl = outputdir + '/dl.%(ext)s'

    params = {
        'format': formats,
        'outtmpl': outtmpl,
        'writedescription': True,
        'writeinfojson': True,
        'writesubtitles': subtitles,
        'writeautomaticsub': False,
        'subtitleslangs': ['all', '-live_chat'],
        'subtitlesformat': 'srt/ass/vtt/best',
        'cachedir': '/tmp/',
        'noplaylist': True,  # not implemented in video2commons
        'postprocessors': [{
            'key': 'FFmpegSubtitlesConvertor',
            'format': 'srt',
        }],
        'max_filesize': 5 * (1 << 30),
        'retries': 10,
        'fragment_retries': 10,
        'prefer_ffmpeg': True,  # avconv do not have srt encoder
        'prefer_free_formats': True,
        'logger': get_logger('celery.task.v2c.main.yt_dlp')
    }

    old_ua = std_headers['User-Agent']
    if ie_key == 'Youtube':
        # HACK: Get equirectangular for 360° videos (ytdl-org/youtube-dl#15267)
        std_headers['User-Agent'] = ''

    last_percentage = [Ellipsis]

    def progresshook(d):
        if d['status'] == 'downloading':
            total = d.get('total_bytes') or d.get('total_bytes_estimate')
            percentage = int(100.0 * d['downloaded_bytes'] / total)\
                if total else None
            if percentage != last_percentage[0]:
                last_percentage[0] = percentage
                statuscallback(
                    'Downloading to ' + (d['tmpfilename'] or d['filename']),
                    percentage
                )
        elif d['status'] == 'finished':
            statuscallback('Postprocessing...', -1)
        elif d['status'] == 'error':
            errorcallback('Error raised by YoutubeDL')

    statuscallback('Creating YoutubeDL instance', -1)
    dl = yt_dlp.YoutubeDL(params)
    dl.add_progress_hook(progresshook)

    statuscallback('Preprocessing...', -1)
    try:
        # Not using provided ie_key because of the existance of extractors that
        # targets another extractor, such as TwitterIE.
        info = dl.extract_info(url, download=True, ie_key=None)
    except DownloadError:
        params['cachedir'] = False
        statuscallback('Download failed.'
                       ' creating YoutubeDL instance without local cache', -1)
        dl = yt_dlp.YoutubeDL(params)
        dl.add_progress_hook(progresshook)
        info = dl.extract_info(url, download=True, ie_key=None)

    finally:
        std_headers['User-Agent'] = old_ua

    if info.get('webpage_url'):
        url_blacklisted(info['webpage_url'])

    filename = outtmpl % {'ext': info['ext']}
    if not os.path.isfile(filename):
        # https://github.com/rg3/youtube-dl/issues/8349
        filename = outtmpl % {'ext': 'mkv'}
        assert os.path.isfile(filename), \
            'Failed to determine the path of the downloaded video. ' + \
            'Is the video too large?'

    ret = {
        'extractor': ie_key,
        'subtitles': {},
        'target': filename,
    }

    for key in info.get('subtitles', {}):
        # Postprocesed: converted to srt
        filename = outtmpl % {'ext': key + '.srt'}
        if os.path.isfile(filename):
            ret['subtitles'][key] = filename

    return ret


def url_blacklisted(url):
    """Define download url blacklist."""
    parseresult = urlparse(url)
    if parseresult.scheme in ['http', 'https']:
        if parseresult.netloc.endswith('.googlevideo.com'):
            raise TaskError('Your downloading URL has been blacklisted.')