# scanners/third_parties.py
import logging
from utils import utils
# Evaluate third-party service usage using headless Chrome.
# Flag: this scanner can also be run in Lambda.
lambda_support = True
# Flag: this is a JS-based scan using headless Chrome.
# The scan method is defined in third_parties.js instead of in this module.
scan_headless = True
# Use pshtt data if we have it, to either skip redirect/inactive
# domains, or to start with the canonical URL right away.
def init_domain(domain, environment, options):
    """Decide whether a domain should be scanned and build its start URL.

    When pshtt data is available, it is used to skip domains that were
    not live or that are just external redirectors, and to start from
    the canonical endpoint instead of guessing one.

    Args:
        domain: A bare hostname, or a full ``http://``/``https://`` URL.
        environment: Not used by this function.
        options: Scan options. The cache directory is read from
            ``options["_"]["cache_dir"]`` and defaults to ``"./cache"``.

    Returns:
        ``False`` if the domain should be skipped; otherwise a dict
        ``{'url': ...}`` whose URL ends with exactly one trailing ``/``
        appended only when it was missing.
    """
    cache_dir = options.get("_", {}).get("cache_dir", "./cache")

    # If we have data from pshtt, skip if it's not a live domain.
    # cache_dir is passed here as it is to the other pshtt lookups below,
    # so that all three read from the same cache location.
    if utils.domain_not_live(domain, cache_dir=cache_dir):
        logging.debug("\tSkipping, domain not reachable during inspection.")
        return False

    # If we have data from pshtt, skip if it's just a redirector.
    if utils.domain_is_redirect(domain, cache_dir=cache_dir):
        logging.debug("\tSkipping, domain seen as just an external redirector during inspection.")
        return False

    # To scan, we need a URL, not just a domain.
    if domain.startswith(('http://', 'https://')):
        url = domain
    else:
        # Look the canonical endpoint up once; fall back to plain HTTPS
        # when pshtt has nothing for this domain.
        canonical = utils.domain_canonical(domain, cache_dir=cache_dir)
        url = canonical if canonical else 'https://' + domain

    # Standardize by ending with a /, without doubling one already there.
    if not url.endswith("/"):
        url = url + "/"

    return {'url': url}
# Takes the return value of scan() and converts it to a CSV row.
def to_rows(data):
    """Flatten one scan result into a list holding a single CSV row.

    Args:
        data: Mapping with a ``'url'`` entry plus the list-valued entries
            named below; each list must contain strings.

    Returns:
        A one-element list whose only item is the row: the URL, the count
        of external domains, then each list joined with ``" | "``.
    """
    # Fields rendered as " | "-separated text, in output column order.
    joined_fields = (
        'external_domains',
        'external_urls',
        'nearby_domains',
        'nearby_urls',
        'known_services',
        'unknown_services',
        'page_urls',
        'page_domains',
    )

    row = [data['url'], len(data['external_domains'])]
    row.extend(" | ".join(data[field]) for field in joined_fields)
    return [row]
# CSV column titles, one per value in the row produced by to_rows(),
# listed in the same order.
headers = [
    "Scanned URL",
    "Number of External Domains",
    "External Domains",
    "External URLs",
    "Nearby Domains",
    "Nearby URLs",
    "Known Services",
    "Unknown Services",
    "URLs on the page",
    "Domains in the URLs on the page",
]