# 18F/domain-scan: scanners/third_parties.py

import logging

from utils import utils

# Evaluate third-party service usage using headless Chrome.

# Can also be run in Lambda.
lambda_support = True

# Signal that this is a JS-based scan using headless Chrome.
# The scan method will be defined in third_parties.js instead.
scan_headless = True
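
# These flags are read by domain-scan's gathering driver. A rough, purely
# hypothetical sketch of the dispatch they imply (the real logic lives in
# the scan runner, not here):
#
#     import importlib
#     import subprocess
#
#     scanner = importlib.import_module("scanners.third_parties")
#     if getattr(scanner, "scan_headless", False):
#         # JS-based scan: hand the domain to the matching .js file under Node.
#         subprocess.run(["node", "scanners/third_parties.js", domain])
#     else:
#         scanner.scan(domain, environment, options)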


# Use pshtt data if we have it, to either skip redirect/inactive
# domains, or to start with the canonical URL right away.
def init_domain(domain, environment, options):
    cache_dir = options.get("_", {}).get("cache_dir", "./cache")

    # If we have data from pshtt, skip if it's not a live domain.
    if utils.domain_not_live(domain, cache_dir=cache_dir):
        logging.debug("\tSkipping, domain not reachable during inspection.")
        return False

    # If we have data from pshtt, skip if it's just a redirector.
    if utils.domain_is_redirect(domain, cache_dir=cache_dir):
        logging.debug("\tSkipping, domain seen as just an external redirector during inspection.")
        return False

    # To scan, we need a URL, not just a domain.
    url = None
    if not domain.startswith(('http://', 'https://')):

        # If we have data from pshtt, use the canonical endpoint.
        canonical = utils.domain_canonical(domain, cache_dir=cache_dir)
        if canonical:
            url = canonical

        # Otherwise, fall back to assuming HTTPS support.
        else:
            url = 'https://' + domain
    else:
        url = domain

    # Standardize by ensuring the URL ends with a /.
    if not url.endswith("/"):
        url = url + "/"

    return {'url': url}
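
# A minimal usage sketch (hypothetical, not part of the scanner): with the
# pshtt lookups mocked out to return nothing, init_domain falls back to
# https:// and standardizes with a trailing slash.
#
#     from unittest import mock
#
#     with mock.patch.object(utils, "domain_not_live", return_value=False), \
#          mock.patch.object(utils, "domain_is_redirect", return_value=False), \
#          mock.patch.object(utils, "domain_canonical", return_value=None):
#         init_domain("example.gov", {}, {})
#         # => {'url': 'https://example.gov/'}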


# Takes the return value of scan() and converts it to a CSV row.
def to_rows(data):
    return [[
        data['url'],
        len(data['external_domains']),
        " | ".join(data['external_domains']),
        " | ".join(data['external_urls']),
        " | ".join(data['nearby_domains']),
        " | ".join(data['nearby_urls']),
        " | ".join(data['known_services']),
        " | ".join(data['unknown_services']),
        " | ".join(data['page_urls']),
        " | ".join(data['page_domains'])
    ]]


headers = [
    'Scanned URL',
    'Number of External Domains',
    'External Domains',
    'External URLs',
    'Nearby Domains',
    'Nearby URLs',
    'Known Services',
    'Unknown Services',
    'URLs on the page',
    'Domains in the URLs on the page'
]
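

# A small demo of the CSV shape, runnable directly. The sample dict below is
# hypothetical, shaped like the result the headless scan is expected to return.
if __name__ == "__main__":
    import csv
    import sys

    sample = {
        'url': 'https://example.gov/',
        'external_domains': ['dap.digitalgov.gov', 'www.google-analytics.com'],
        'external_urls': ['https://dap.digitalgov.gov/Universal-Federated-Analytics-Min.js'],
        'nearby_domains': [],
        'nearby_urls': [],
        'known_services': ['Digital Analytics Program', 'Google Analytics'],
        'unknown_services': [],
        'page_urls': ['https://example.gov/'],
        'page_domains': ['example.gov'],
    }

    writer = csv.writer(sys.stdout)
    writer.writerow(headers)
    for row in to_rows(sample):
        writer.writerow(row)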