# dblume/netflix-dvd-feed
#
# View on GitHub
# netflix-dvd-feed.py
#
# Summary
#
# Maintainability
# B
# 4 hrs
# Test Coverage
#!/usr/bin/env python3
import sys
import os
import io
import time
import smtplib
import traceback
from argparse import ArgumentParser
import imaplib
import email
from email.header import decode_header
import quopri
import re
import cgi
import cfgreader
import urllib.request
import html

# Read in custom configurations.
# The .cfg file lives next to this script (same path, .cfg extension);
# cfgreader is a project-local helper -- see that module for the schema.
g_cfg = cfgreader.CfgReader(__file__.replace('.py', '.cfg'))

# These two strings will form the header and individual
# items of the RSS feed.
# Note the doubled %% in <pubDate>: it survives this first interpolation
# as a literal %s placeholder, which write_feed() fills in with the
# build timestamp at feed-generation time.
feed_header = """<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>DVDs shipped for %s</title>
<link>http://dvd.netflix.com/Queue</link>
<atom:link href="http://%s/%s.xml" rel="self" type="application/rss+xml" />
<pubDate>%%s</pubDate>
<description>Feed automatically generated by %s</description>
<language>en-us</language>
""" % (g_cfg.main.name, g_cfg.main.url_base,
       g_cfg.main.rss_base, g_cfg.main.url_base)

# Per-item template filled in by write_feed() with
# (title, ship_date, url, guid, url, title, short ship_date).
feed_item = """<item>
<title>%s</title>
<pubDate>%s</pubDate>
<link>%s</link>
<guid isPermaLink="false">%s</guid>
<description>The disc &lt;a href="%s"&gt;%s&lt;/a&gt; was shipped on %s.</description>
</item>
"""


def set_v_print(verbose):
    """
    Defines the function v_print.
    It prints if verbose is true, otherwise, it does nothing.
    See: http://stackoverflow.com/questions/5980042
    :param verbose: A bool to determine if v_print will print its args.
    """
    global v_print
    v_print = print if verbose else lambda *a, **k: None


def send_email(subject, msg, toaddrs,
               fromaddr='"%s" <%s>' % (os.path.basename(__file__),
                                       g_cfg.smtp.from_addr)):
    """ Sends Email
    This function is only used in an emergency (to report an exception).

    :param subject: Subject line for the message.
    :param msg: Plain-text body of the message.
    :param toaddrs: Iterable of recipient addresses.
    :param fromaddr: From header value; defaults to this script's name
                     plus the configured from address (evaluated at import).
    """
    smtp = smtplib.SMTP('localhost', port=g_cfg.smtp.port)
    try:
        smtp.login(g_cfg.smtp.user, g_cfg.smtp.password)
        # RFC 5322 requires an empty line between the header block and the
        # body.  The original message had only one CRLF after Subject, so
        # the first body line was swallowed as a (bogus) header.
        smtp.sendmail(fromaddr, toaddrs,
                      "Content-Type: text/plain; charset=\"us-ascii\"\r\n"
                      "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" %
                      (fromaddr, ", ".join(toaddrs), subject, msg))
    finally:
        # Always close the SMTP connection, even if login/sendmail raised.
        smtp.quit()


def write_feed(script_dir, feed_items):
    """ Given a list of feed_items, write an RSS feed.

    Writes to a temp file first, then renames it over the real feed file
    so readers never observe a partially-written feed.

    :param script_dir: Directory in which the feed file is written.
    :param feed_items: List of (title, url, ship_date) tuples, newest first.
    :return: A status string that starts with "OK" on success.
    """
    now = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())
    temp_fname = os.path.join(script_dir, g_cfg.main.rss_base + '.temp.xml')
    dest_fname = os.path.join(script_dir, g_cfg.main.rss_base + '.xml')
    with open(temp_fname, 'wb') as f:
        f.write(bytes(feed_header % (now,), 'utf-8'))
        # reversed(): oldest item first in the file.
        for index, (title, url, ship_date) in enumerate(reversed(feed_items)):
            # cgi.escape was removed in Python 3.8; html.escape with
            # quote=False escapes exactly the same characters (&, <, >).
            title = html.escape(title, quote=False)
            guid = "%s+%s+%d" % (url, now, index)
            f.write(bytes(feed_item % (title,
                                       ship_date,
                                       url,
                                       guid,
                                       url, title, ship_date[:-15]), 'utf-8'))
        f.write(bytes('</channel></rss>', 'utf-8'))
    # Atomic-ish swap: only reached if every write above succeeded.
    os.rename(temp_fname, dest_fname)
    return "OK (Wrote %d new item%s.)" % \
           (len(feed_items), "s" if len(feed_items) > 1 else "")


def get_titles_from_html_part(part, charset, debug):
    """ Gets the titles *and* URLs with the same regex.

    :param part: An email message part whose content type is text/html.
    :param charset: Character set used to decode the part's payload.
    :param debug: When true, dump the decoded HTML with line numbers.
    :return: ("OK", titles, urls) on success, or
             ("NO", [error message], []) when no title markup is found.
    """
    titlepat = re.compile(
        """<h2\ style="box-sizing
           (?:[^<>]+?)     # No-capture nongreedy all the style
           ><a\ class="
           (?:[m_\-\d]*?)  # Sometimes it's a modified class name
           medium"\ href="
           ([^"]+?)"       # Capture the URL, everything up to the quotes
           \ (?:[^<>]+?)>  # Ignore more style stuff, up to a >
           (.+?)           # Capture the title
           [\s]*           # whitespace, incl newlines
           </a></h2>""", re.MULTILINE | re.VERBOSE)
    # decode=True undoes the transfer encoding (e.g. quoted-printable)
    # before we decode the raw bytes with the message's charset.
    txt = part.get_payload(decode=True).decode(encoding=charset)

    if debug:
        for n, line in enumerate(txt.splitlines()):
            print("%03d: \"%s\"" % (n, line))

    titles = []
    urls = []
    # finditer yields the same non-overlapping matches as the original
    # search-and-truncate loop, without re-copying the string each time.
    for match in titlepat.finditer(txt):
        urls.append(match.group(1))
        titles.append(html.unescape(match.group(2).strip()))
    if not titles:
        v_print("Found no titles in html part of message.")
        return ("NO",
               ["The HTML doesn't border the title with the expected markup."],
               [])
    v_print("Found %d titles: %s" % (len(titles), str(titles)))
    return "OK", titles, urls


def subject_is_recognized(subject):
    """ Returns whether this subject line is known to be associated
    with email that contains a title of a DVD to be shipped.
    """
    shipped_prefixes = ("We sent you ", "We shipped you ", "We shipped the last ")
    if subject.startswith(shipped_prefixes):
        return True
    # A subject shaped like "For Wed: Some Movie" also means a shipment.
    return subject.startswith("For ") and ':' in subject


def resolve_redirects(url):
    """ Returns the final URL of a potential redirect.

    Resolution is best-effort: on any failure the original url is
    returned unchanged and the failure is logged.

    :param url: The URL to resolve.
    :return: The final URL, or the original url on failure.
    """
    try:
        # Context manager closes the response, so the socket isn't leaked.
        with urllib.request.urlopen(url) as res:
            url = res.geturl()
    except Exception:
        # Deliberately broad (but no longer a bare except that would also
        # swallow KeyboardInterrupt/SystemExit): this must never abort
        # the whole run just because one URL failed to resolve.
        print("Failed to resolve redirect of", url)
    return url


def main(script_dir, debug):
    """ Fetch all the mail, and try to find messages that
    match a pattern like, "For Wed: Some Movie".

    It'll add those movies to the RSS feed, and if that's
    completed successfully, then it'll delete the processed email.

    :param script_dir: Directory where the feed file is written.
    :param debug: When true, processed messages are NOT deleted.
    :return: A status string (starts with "OK" on success).
    """
    server = imaplib.IMAP4(g_cfg.imap.mailbox)
    server.login(g_cfg.imap.user, g_cfg.imap.password)
    server.select()
    status, data = server.search(None, 'ALL')
    if status != 'OK':
        raise Exception('Getting the list of messages resulted in %s' % status)

    messages_to_delete = []
    feed_items = []
    # data[0].split() is evaluated once before the loop starts, so
    # reassigning `data` inside the body does not disturb the iteration.
    for num in data[0].split():  # For each email message...
        status, data = server.fetch(num, '(RFC822)')
        if status != 'OK':
            raise Exception('Fetch message %s resulted in %s' % (num, status))
        msg = email.message_from_bytes(data[0][1])
        subject = msg['Subject']
        # Subjects may arrive RFC 2047-encoded; only the UTF-8 case is
        # handled here (the first decoded fragment is taken as the subject).
        if subject.startswith('=?UTF-') or subject.startswith('=?utf-'):
            subject, encoding = decode_header(subject)[0]
            subject = subject.decode(encoding)

        # Strip a forwarding prefix so the checks below still match.
        if subject.startswith('Fwd: '):
            subject = subject[5:]

        if subject.startswith("We received") or \
           subject.startswith("Shipping today!"):
            # We will bypass, but delete this message as recognized
            v_print("Bypassing message", num, subject)
            messages_to_delete.append(num)
            continue

        if not subject_is_recognized(subject):
            # Unknown subject: leave it in the mailbox for a human to review.
            print('Subject "%s" was unexpected, but the script is continuing. '
                  'Please take a look at the mailbox.' % subject)
            continue

        v_print("Processing message", num, subject)
        titles = None
        for part in msg.walk():
            # multipart/* are just containers
            if part.get_content_maintype() == 'multipart':
                v_print("Skipping multipart part looking for text/html.")
                continue
            content_type = part.get_content_type()
            if content_type == "text/plain":
                print("Netflix started using %s again!" % content_type)
                print("Consider restoring simpler code from git repo!")
            elif content_type == "text/html":
                status, titles, urls = get_titles_from_html_part(part,
                                      part.get_content_charset('utf-8'), debug)
                if status != 'OK':
                    raise Exception(titles[0])

        messages_to_delete.append(num)
        # Append the movie names and URLs to a list of items
        # NOTE(review): if a recognized message has no text/html part,
        # `titles` stays None and len(titles) below raises TypeError --
        # confirm such messages cannot occur in practice.
        for i in range(len(titles)):
            feed_items.append((titles[i], resolve_redirects(urls[i]),
                               msg['Date']))

    # Ensure the new feed is written
    update_status = "OK"
    if len(feed_items) > 0:
        update_status = write_feed(script_dir, feed_items)

    # Only delete mail after the feed was written successfully; in debug
    # mode nothing is deleted so reruns see the same messages.
    if update_status.startswith("OK") and debug == False:
        # Now delete only the messages marked for deletion
        for num in messages_to_delete:
            server.store(num, '+FLAGS', '\\Deleted')
        server.expunge()

    server.close()
    server.logout()
    print(update_status)
    return update_status


if __name__ == '__main__':
    # Everything here and below is boiler-plate for cron jobs.
    script_dir = os.path.abspath(os.path.dirname(sys.argv[0]))
    parser = ArgumentParser(description="cronjob to create Netflix DVD feed.")
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()
    set_v_print(args.verbose)

    start_time = time.time()
    if args.debug:
        # Debug mode: run in the foreground with normal stdout/stderr.
        message = main(script_dir, args.debug)
    else:
        # Cron mode: capture all output into a StringIO so it can be
        # written to the log file (and emailed on failure) instead of
        # being lost or spamming cron mail.
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = sys.stderr = io.StringIO()
        try:
            main(script_dir, args.debug)
        except Exception as e:
            exceptional_text = "Exception: " + str(e.__class__) + " " + str(e)
            print(exceptional_text)
            traceback.print_exc(file=sys.stdout)
            try:
                # Best-effort: try to report the failure by email too.
                send_email('Exception thrown in %s' % (os.path.basename(__file__),),
                           exceptional_text + "\n" + traceback.format_exc(),
                           (g_cfg.smtp.to,))
            except Exception as e:
                traceback.print_exc(file=sys.stdout)
                print("Could not send email to notify you of the exception. :(")

        # Restore the real streams before writing the log below.
        message = sys.stdout.getvalue()
        sys.stdout = old_stdout
        sys.stderr = old_stderr

    # Finally, let's save this to a statistics page
    # (one line per run, newest first).
    if os.path.exists(os.path.join(script_dir, g_cfg.main.logfile)):
        with open(os.path.join(script_dir, g_cfg.main.logfile), 'r', encoding='utf-8') as f:
            lines = f.readlines()
    else:
        lines = []
    lines = lines[:168]  # Just keep some recent lines
    # Indent continuation lines so multi-line statuses stay aligned
    # under the timestamp column in the log.
    status = u'\n                       '.join(message.splitlines())
    lines.insert(0, u"%s %3.0fs %s\n" % (time.strftime('%Y-%m-%d, %H:%M', time.localtime()),
                                         time.time() - start_time,
                                         status))
    with open(os.path.join(script_dir, g_cfg.main.logfile), 'w', encoding='utf-8') as f:
        f.writelines(lines)