18F/domain-scan
gather
#!/usr/bin/env python3
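"""
Gather hostnames from one or more gatherer sources, normalize and
de-duplicate them, and write the combined results to
<results_dir>/gathered.csv, plus scan metadata to <results_dir>/meta.json.
"""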

import os
import glob
import sys
import re
import csv
import requests
import logging
import importlib

from utils import utils

# Some metadata about the scan itself.
start_time = utils.local_now()
start_command = " ".join(sys.argv)

# Applied if --ignore-www is enabled.
strip_www = re.compile(r"^www\.")

# Applied to all domains.
strip_protocol = re.compile(r"^https?://")
strip_wildcard = re.compile(r"^(\*\.)+")
strip_redacted = re.compile(r"^(\?\.)+")


def run(options=None, cache_dir="./cache", results_dir="./results"):
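    """
    Run each gatherer named in options["gatherers"] and write a combined
    CSV of discovered hostnames.

    Each gatherer is imported from gatherers/ and must expose a Gatherer
    class whose gather() method yields hostnames. The output CSV has one
    True/False column per gatherer, recording which sources each hostname
    was seen in.
    """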

    sources = options["gatherers"]

    suffixes = options.get("suffix")
    suffix_pattern = utils.suffix_pattern(suffixes)

    # Clear out existing result CSVs, to avoid inconsistent data.
    for result in glob.glob("%s/*.csv" % results_dir):
        os.remove(result)

    # Opt in to include parent (second-level) domains.
    include_parents = options.get("include_parents", False)

    # Opt into stripping www. prefixes from hostnames, effectively
    # collapsing www.[host] and [host] into one record.
    ignore_www = options.get("ignore_www", False)

    # --parents should be a CSV whose first column is parent domains
    # that will act as a whitelist for which subdomains to gather.
    parents = get_parent_domains(options, cache_dir=cache_dir)

    # De-dupe hostnames across sources. Maps each hostname to the list of
    # source names it appeared in. This holds every hostname in memory at once.
    hostnames_cache = {}

    for source in sources:
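        # "extra" carries per-source settings; the hot-register path below
        # uses it to pass the source name to the generic URL gatherer.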
        extra = {}

        try:
            gatherer_module = importlib.import_module(
                "gatherers.%s" % source)
            gatherer = gatherer_module.Gatherer(suffixes, options, extra)
        except ImportError:
            # If it's not a registered module, allow it to be "hot registered"
            # as long as the user gave us a flag with that name that can be
            # used as the --url option to the URL module.
            if options.get(source):
                gatherer_module = importlib.import_module("gatherers.url")
                extra['name'] = source
                gatherer = gatherer_module.Gatherer(suffixes, options, extra)
            else:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logging.error(
                    "[%s] Gatherer not found, or had an error during loading."
                    "\n\tERROR: %s\n\t%s" % (source, exc_type, exc_value))
                exit(1)

        # Iterate over each hostname.
        for domain in gatherer.gather():

            # Always apply the suffix filter to returned names.
            if not suffix_pattern.search(domain):
                continue

            # Strip off whitespace before pre-processing.
            domain = domain.strip()

            # Cut off protocols, if present.
            domain = strip_protocol.sub("", domain)

            # Cut naive wildcard prefixes out. (from certs)
            domain = strip_wildcard.sub("", domain)

            # Cut off any redaction markers from names. (from certs)
            domain = strip_redacted.sub("", domain)

            # Strip www. prefixes from hostnames, effectively
            # collapsing www.[host] and [host] into one record.
            if ignore_www:
                domain = strip_www.sub("", domain)

            # Strip off whitespace after pre-processing.
            domain = domain.strip()

            base = utils.base_domain_for(domain)

            # Unless --include-parents is specified, exclude them.
            if not include_parents:
                # Always ignore www prefixes for base domains.
                if (domain == base) or (domain == "www.%s" % base):
                    continue

            # Apply --parent domain whitelist, if present.
            if parents:
                if base not in parents:
                    continue

            # Use hostname cache to de-dupe, if seen before.
            if domain not in hostnames_cache:
                hostnames_cache[domain] = [source]
            elif source not in hostnames_cache[domain]:
                hostnames_cache[domain] += [source]

    # Now that we've gone through all sources and logged when each
    # domain appears in each one, go through cache and write
    # all of them to disk.

    # Assemble headers.
    headers = ["Domain", "Base Domain"]
    # Add headers dynamically for each source.
    headers += sources

    # Open CSV file.
    gathered_filename = "%s/%s.csv" % (results_dir, "gathered")
    gathered_file = open(gathered_filename, 'w', newline='')
    gathered_writer = csv.writer(gathered_file)
    gathered_writer.writerow(headers)

    # Write each hostname to disk, with all discovered sources.
    hostnames = list(hostnames_cache.keys())
    hostnames.sort()

    for hostname in hostnames:
        base = utils.base_domain_for(hostname)
        row = [hostname, base]
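        # One True/False column per source, marking where this hostname was seen.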
        for source in sources:
            row += [source in hostnames_cache[hostname]]
        gathered_writer.writerow(row)

    # Close CSV file.
    gathered_file.close()

    # If sort requested, sort in place by domain.
    if options.get("sort"):
        utils.sort_csv(gathered_filename)

    logging.warning("Results written to CSV.")

    # Save metadata.
    end_time = utils.local_now()
    metadata = {
        'start_time': utils.utc_timestamp(start_time),
        'end_time': utils.utc_timestamp(end_time),
        'command': start_command
    }
    utils.write(utils.json_for(metadata), "%s/meta.json" % results_dir)


# Read in parent domains from the first column of a given CSV.
def get_parent_domains(options, cache_dir="./cache"):
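    """
    Return a list of lowercased parent domains, or None if --parents was
    not given. If --parents is an http(s) URL, it is downloaded into
    cache_dir first and read from there.
    """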
    parents = options.get("parents")
    if not parents:
        return None

    # If --parents is a URL, we want to download it now,
    # and then adjust the value to be the path of the cached download.
    if parents.startswith("http:") or parents.startswith("https:"):

        # Though it's saved in cache/, it will be downloaded every time.
        parents_path = os.path.join(cache_dir, "parents.csv")

        try:
            response = requests.get(parents)
            utils.write(response.text, parents_path)
        except Exception:
            logging.error("Parent domains URL not downloaded successfully.")
            print(utils.format_last_exception())
            exit(1)

        parents = parents_path

    # Whether given as a local path or downloaded from a URL, `parents` now
    # points to a local CSV file.
    parent_domains = []
    with open(parents, encoding='utf-8', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Skip empty rows, blank first columns, and header rows.
            if (not row) or (not row[0]) or (row[0].lower() in ("domain", "domain name")):
                continue
            parent_domains.append(row[0].lower())

    return parent_domains


if __name__ == '__main__':
    options = utils.options_for_gather()
    utils.configure_logging(options)

    # Support --output flag for changing where cache/ and results/ go.
    cache_dir = utils.cache_dir(options)
    results_dir = utils.results_dir(options)
    utils.mkdir_p(cache_dir)
    utils.mkdir_p(results_dir)

    run(options, cache_dir=cache_dir, results_dir=results_dir)