scanners/privacy.py
import logging
import re
import urllib.request

import requests
###
# Scan focused on learning about the /privacy page, as per
# https://github.com/18F/site-scanning/issues/89.
###

# Set a default number of workers for a particular scan type.
# Overridden by a --workers flag. XXX not actually overridden?
workers = 50

def mergelists(a, b):
    # Merge two lists into one deduplicated list.
    return list(set().union(a, b))
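# For example, mergelists(['a@x.gov'], ['a@x.gov', 'b@x.gov']) returns a
# deduplicated ['a@x.gov', 'b@x.gov'] (set-based, so order is not guaranteed).
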
# Required scan function. This is the meat of the scanner, where things
# that use the network or are otherwise expensive would go.
#
# Runs locally or in the cloud (Lambda).
def scan(domain: str, environment: dict, options: dict) -> dict:
    logging.debug("Scan function called with options: %s", options)

    results = {}
    url = 'https://' + domain + '/privacy'

    # Get the status code and final URL for /privacy.
    try:
        response = requests.head(url, allow_redirects=True, timeout=4)
        results['status_code'] = str(response.status_code)
        results['final_url'] = response.url
    except Exception:
        logging.debug("could not get data from %s", url)
        results['status_code'] = str(-1)
        results['final_url'] = ''
    # Search /privacy for email addresses in mailto: links.
    results['emails'] = []
    try:
        with urllib.request.urlopen(url, timeout=5) as privacypage:
            for line in privacypage:
                line = line.decode().rstrip()
                emails = re.findall('<a href="mailto:(.*?)"', line)
                if emails:
                    results['emails'] = mergelists(emails, results['emails'])
    except Exception:
        logging.debug('error while trying to retrieve emails from %s', url)
    # Search /privacy for <h1>/<h2>/<h3> headings.
    results['h1'] = []
    results['h2'] = []
    results['h3'] = []
    try:
        with urllib.request.urlopen(url, timeout=5) as privacypage:
            for line in privacypage:
                line = line.decode().rstrip()
                h1s = re.findall('<h1>(.*)</h1>', line)
                h2s = re.findall('<h2>(.*)</h2>', line)
                h3s = re.findall('<h3>(.*)</h3>', line)
                if h1s or h2s or h3s:
                    results['h1'] = mergelists(h1s, results['h1'])
                    results['h2'] = mergelists(h2s, results['h2'])
                    results['h3'] = mergelists(h3s, results['h3'])
    except Exception:
        logging.debug('error while trying to retrieve headings from %s', url)

    logging.warning("privacy scan of %s complete!", domain)
    return results
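
# Illustrative shape of the dict that scan() returns; the values here are
# made-up examples, not captured output:
# {'status_code': '200', 'final_url': 'https://example.gov/privacy',
#  'emails': ['privacy@example.gov'], 'h1': ['Privacy policy'], 'h2': [], 'h3': []}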

# Required CSV row conversion function. Usually one row, can be more.
#
# Run locally.
def to_rows(data):
    row = []
    for header in headers:
        row.append(data[header])
    return [row]
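# Given the illustrative results dict above, to_rows() would yield a single
# row whose cells follow the `headers` order, e.g.
# [['200', 'https://example.gov/privacy', ['privacy@example.gov'],
#   ['Privacy policy'], [], []]]
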
# CSV headers for each row of data. Referenced locally.
headers = [
    'status_code',
    'final_url',
    'emails',
    'h1',
    'h2',
    'h3',
]
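
# A minimal local smoke test, assuming the module is run directly rather than
# through the scanning pipeline; 'gsa.gov' and the empty environment/options
# dicts are illustrative assumptions, not part of the pipeline contract.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    data = scan('gsa.gov', {}, {})
    print(to_rows(data))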