mpkato/mobileclick

View on GitHub
mobileclick/scripts/mobileclick_download_data.py

Summary

Maintainability
A
25 mins
Test Coverage
# -*- coding:utf-8 -*-
MOBILECLICK_URL = 'http://www.mobileclick.org'
SUBSET_FILENAME = 'MC2-training-documents-subset.tar.gz'

def download_and_deploy(datafilenames):
    import sys, os, getpass
    email = os.environ.get('MOBILECLICK_EMAIL', None)
    password = os.environ.get('MOBILECLICK_PASSWORD', None)
    if email is None or password is None:
        # if env variables are not set
        print "Input your email and password for %s/" % MOBILECLICK_URL
        print "Please sign up if you haven't got them."
        print
        sys.stdout.write('Email: ')
        email = raw_input()
        password = getpass.getpass()
        print
    if login(email, password):
        links = find_download_links(datafilenames)
        for link in links:
            filename = download_file(link)
            print "Extracting files ..."
            deploy_data(filename)
            print
        print "Finished downloading and extracting all the data."
        print "Please find downloaded files at './data'."
    else:
        print "Login failed"


def login(email, password):
    import urllib, urllib2, BeautifulSoup, re
    # cookie setting
    cookie = urllib2.HTTPCookieProcessor()
    opener = urllib2.build_opener(cookie)
    urllib2.install_opener(opener)

    # get the token
    res = urllib2.urlopen('%s/users/sign_in' % MOBILECLICK_URL)
    html = BeautifulSoup.BeautifulSoup(res.read())
    loginform = html.find('form', {'id': 'new_user'})
    token = loginform.find('input', {'name': 'authenticity_token'})['value']

    # login
    values = {'user[email]': email, 'user[password]': password}
    data = urllib.urlencode(values)
    req = urllib2.Request('%s/users/sign_in' % MOBILECLICK_URL, 
        data, {'X-CSRF-Token': token, 'Referer': '%s/users/sign_in' % MOBILECLICK_URL})
    res = urllib2.urlopen(req)
    result = BeautifulSoup.BeautifulSoup(res.read())
    suc = result.find('div', {'class': re.compile(r'\balert-success\b')})

    # login success?
    return True if suc else False

def download_file(url):
    import sys, urllib2
    filename = url.split('/')[-1]
    filename = filename.split('?')[0]
    res = urllib2.urlopen(url)
    meta = res.info()
    filesize = int(meta.getheaders("Content-Length")[0])
    print "Downloading: {0} (Bytes: {1})".format(url, filesize)

    filesize_obtained = 0
    blocksize = 8192
    with open(filename, 'wb') as f:
        while True:
            buf = res.read(blocksize)
            if not buf:
                break
            filesize_obtained += len(buf)
            f.write(buf)
            p = float(filesize_obtained) / filesize
            status = r"{0}  [{1:.2%}]".format(filesize_obtained, p)
            status += "[%s]" % ('*'*int(p*100) + ' '*(100 - int(p*100)) )
            status += chr(8)*(len(status)+1)
            sys.stdout.write(status)
            sys.stdout.flush()
    print
    print "Downloaded: {0}".format(url)
    return filename

def find_download_links(datafilenames):
    import urllib2, BeautifulSoup
    res = urllib2.urlopen('%s/home/data' % MOBILECLICK_URL)
    html = BeautifulSoup.BeautifulSoup(res.read())
    result = []
    for datafilename in datafilenames:
        def is_download_link(node):
            return node.name == 'a' and node['href'].endswith(datafilename)
        link = html.find(is_download_link)['href'] + '?dl=1'
        result.append(link)
    return result

def deploy_data(filename, datadir='./data'):
    import os, tarfile, zipfile, glob
    if tarfile.is_tarfile(filename):
        with tarfile.open(filename, 'r') as tf:
            tf.extractall(datadir)
    for filepath in glob.glob('%s/*/*.zip' % datadir):
        basedir = os.path.dirname(filepath)
        if zipfile.is_zipfile(filepath):
            with zipfile.ZipFile(filepath, 'r') as zf:
                zf.extractall(basedir + '/')