websecmap/scanners/scanner/screenshot.py
"""
Uses a docker container / service to retrieve screenshots.
Using: https://github.com/alvarcarto/url-to-pdf-api
In docker container: https://github.com/microbox/node-url-to-pdf-api
API examples:
https://github.com/alvarcarto/url-to-pdf-api
Test url:
https://url-to-pdf-api.herokuapp.com
Uses configuration setting:
SCREENSHOT_API_URL_V4
SCREENSHOT_API_URL_V6
Which defaults to:
http://screenshot_v4:1337
http://screenshot_v6:1337
"""
import logging
import urllib.parse
from datetime import datetime, timedelta
from io import BytesIO
from typing import Optional

import pytz
import requests
from celery import Task, group
from constance import config
from django.conf import settings
from django.core.files import File
from django.db.models import Q
from PIL import Image

from websecmap.celery import app
from websecmap.scanners import plannedscan
from websecmap.scanners.models import Endpoint, Screenshot
from websecmap.scanners.plannedscan import retrieve_endpoints_from_urls
from websecmap.scanners.scanner.__init__ import endpoint_filters, q_configurations_to_scan, unique_and_random
from websecmap.scanners.timeout import timeout
log = logging.getLogger(__package__)
def filter_scan(
    organizations_filter: Optional[dict] = None,
    urls_filter: Optional[dict] = None,
    endpoints_filter: Optional[dict] = None,
    **kwargs
):
    """
    Select the urls whose endpoints need a (new) screenshot.

    An endpoint qualifies when it has no screenshot yet, or only a screenshot that is older than 31 days:
    creating screenshots every day is a bit nonsense, so they are refreshed about once a month.

    :param organizations_filter: extra filter handed to endpoint_filters; no extra filtering when omitted.
    :param urls_filter: extra filter handed to endpoint_filters; no extra filtering when omitted.
    :param endpoints_filter: extra filter handed to endpoint_filters; no extra filtering when omitted.
    :param kwargs: accepted but not used.
    :return: the urls of the selected endpoints, passed through unique_and_random.
    """
    # The defaults are None instead of dict(): a default dict is created once at definition time and would be
    # shared (including any mutation by a callee) between all calls.
    organizations_filter = {} if organizations_filter is None else organizations_filter
    urls_filter = {} if urls_filter is None else urls_filter
    endpoints_filter = {} if endpoints_filter is None else endpoints_filter

    one_month_ago = datetime.now(pytz.utc) - timedelta(days=31)

    # chromium also understands FTP servers and renders those
    endpoints = Endpoint.objects.all().filter(
        q_configurations_to_scan(level="endpoint"),
        is_dead=False,
        url__not_resolvable=False,
        url__is_dead=False,
        protocol__in=["http", "https", "ftp"],
        port__in=[80, 443, 8443, 8080, 8888, 21],
    )

    # Without screenshot OR with a screenshot over a month ago
    endpoints = endpoints.filter((Q(screenshot__isnull=True) | Q(screenshot__created_on__lt=one_month_ago)))

    # The caller-supplied filters narrow down the selection made above.
    endpoints = endpoint_filters(endpoints, organizations_filter, urls_filter, endpoints_filter)

    return unique_and_random([endpoint.url for endpoint in endpoints])
@app.task(queue="storage")
def plan_scan(
    organizations_filter: Optional[dict] = None,
    urls_filter: Optional[dict] = None,
    endpoints_filter: Optional[dict] = None,
    **kwargs
):
    """
    Request a planned screenshot scan for every url selected by filter_scan.

    :param organizations_filter: passed on to filter_scan; an empty filter when omitted.
    :param urls_filter: passed on to filter_scan; an empty filter when omitted.
    :param endpoints_filter: passed on to filter_scan; an empty filter when omitted.
    :param kwargs: passed on to filter_scan unchanged.
    """
    # None defaults replace dict(): a default dict is built once and shared between all calls. Every call
    # now hands its own fresh, empty dict to filter_scan when a filter is omitted.
    urls = filter_scan(
        {} if organizations_filter is None else organizations_filter,
        {} if urls_filter is None else urls_filter,
        {} if endpoints_filter is None else endpoints_filter,
        **kwargs
    )
    plannedscan.request(activity="scan", scanner="screenshot", urls=urls)
@app.task(queue="storage")
def compose_planned_scan_task(**kwargs):
    """
    Pick up planned screenshot scans and compose the task that performs them.

    :param kwargs: only "amount" is read: how many urls to pick up, 25 when not given.
    :return: the result of compose_scan_task for the picked up urls.
    """
    amount = kwargs.get("amount", 25)
    planned_urls = plannedscan.pickup(activity="scan", scanner="screenshot", amount=amount)
    return compose_scan_task(planned_urls)
def compose_manual_scan_task(
    organizations_filter: Optional[dict] = None,
    urls_filter: Optional[dict] = None,
    endpoints_filter: Optional[dict] = None,
    **kwargs
) -> Task:
    """
    Compose a screenshot task directly from the given filters, without going through planned scans.

    :param organizations_filter: passed on to filter_scan; an empty filter when omitted.
    :param urls_filter: passed on to filter_scan; an empty filter when omitted.
    :param endpoints_filter: passed on to filter_scan; an empty filter when omitted.
    :param kwargs: passed on to filter_scan unchanged.
    :return: the result of compose_scan_task for the selected urls.
    """
    # None defaults replace dict(): a default dict is built once and shared between all calls. Every call
    # now hands its own fresh, empty dict to filter_scan when a filter is omitted.
    urls = filter_scan(
        {} if organizations_filter is None else organizations_filter,
        {} if urls_filter is None else urls_filter,
        {} if endpoints_filter is None else endpoints_filter,
        **kwargs
    )
    return compose_scan_task(urls)
def compose_scan_task(urls):
    """
    Build a celery group that makes and saves a screenshot for the relevant endpoints of the given urls.

    Each endpoint gets a chain of make_screenshot followed by save_screenshot. IPv4 endpoints are rendered via
    the IPv4 screenshot service, IPv6 endpoints via the IPv6 service; the IPv4 chains come first in the group.
    Endpoints with any other ip_version get no task.

    :param urls: the urls to retrieve the ftp/http/https endpoints from.
    :return: a celery group with one chain per endpoint.
    """
    endpoints, urls_without_endpoints = retrieve_endpoints_from_urls(
        urls, protocols=["ftp", "http", "https"], ports=[80, 443, 8443, 8080, 8888, 21]
    )
    endpoints = unique_and_random(endpoints)

    # Urls that no longer have a relevant endpoint are finished right away, there is nothing to scan for them.
    for url_id in urls_without_endpoints:
        plannedscan.finish("scan", "screenshot", url_id)

    # Read the settings once, so constance is not consulted again for every endpoint.
    v4_service = config.SCREENSHOT_API_URL_V4
    v6_service = config.SCREENSHOT_API_URL_V6

    log.info(f"Trying to make {len(endpoints)} screenshots.")
    log.info(f"Screenshots will be stored at: {settings.MEDIA_ROOT}screenshots/")
    log.info(f"IPv4 screenshot service: {v4_service}, IPv6 screenshot service: {v6_service}")

    tasks = []
    for ip_version, service in ((4, v4_service), (6, v6_service)):
        for endpoint in endpoints:
            if endpoint.ip_version != ip_version:
                continue
            tasks.append(make_screenshot.si(service, endpoint.uri_url()) | save_screenshot.s(endpoint.id))

    return group(tasks)
# We expect the screenshot tool to hang at non responsive urls.
@app.task(queue="screenshot", rate_limit="60/m")
def make_screenshot(service: str, endpoint_url: str):
    """
    Ask the screenshot service to render the given url.

    This task never raises: any exception, including connection errors and timeouts, is returned as the
    result so the caller can inspect it.

    :param service: base address of the screenshot service to use.
    :param endpoint_url: the url to render.
    :return: whatever make_screenshot_with_u2p returns, or the exception it raised.
    """
    try:
        outcome = make_screenshot_with_u2p(service, endpoint_url)
    except Exception as error:
        # ConnectionError and TimeoutError are handled the same way as every other exception.
        outcome = error
    return outcome
@timeout(20, "Took too long to make screenshot.")
def make_screenshot_with_u2p(screenshot_service: str, url: str):
    """
    Request a 1280x720 screenshot of the url from an url-to-pdf-api service.

    NOTE(review): the timeout decorator presumably aborts the call after 20 seconds - confirm in
    websecmap.scanners.timeout.

    :param screenshot_service: base address of the service, "/api/render" is appended to it.
    :param url: the url to render.
    :return: the response of the GET request to the service.
    """
    query = urllib.parse.urlencode(
        {
            "output": "screenshot",
            "url": url,
            "viewport.width": 1280,
            "viewport.height": 720,
            # also give a result when a 404, https error or whatever is given
            "ignoreHttpsErrors": True,
            # gives only the first 'screen' of the page, not the entire page.
            "screenshot.fullPage": False,
        }
    )

    # https://2.python-requests.org/en/latest/user/quickstart/#binary-response-content
    return requests.get(f"{screenshot_service}/api/render?{query}")
@app.task(queue="storage")
def save_screenshot(response, endpoint_id):
    """
    Store a 320x240 thumbnail of a rendered page on disk and register it as a Screenshot.

    :param response: the response of the screenshot service, or the exception that occurred while calling it.
    :param endpoint_id: id of the rendered endpoint; used in the filename and on the Screenshot record.
    :return: False when there was nothing to save (exception or a 500 from the service), otherwise None.
        Errors while storing the image are logged, not raised.
    """
    if isinstance(response, TimeoutError):
        log.debug(
            "Received exception at save screenshot, not saving. Probably took too long to paint page due to "
            f"a too complex page or a terribly slow server. This is business as usual. Details: {response}"
        )
        return False

    if isinstance(response, Exception):
        # No exception is being handled at this point, so log.exception would attach an empty traceback.
        # Passing the received exception as exc_info logs its own traceback instead.
        log.error(
            "Received unexpected exception at save screenshot, not saving. "
            f"Is the screenshot service available at this address? Details: {response}",
            exc_info=response,
        )
        return False

    # with an invalid / non-resolvable address, a 500 error is given by the image service.
    if response.status_code == 500:
        log.debug(
            f"Received 500 at the API from {endpoint_id}, url probably does not resolve. "
            "This is business as usual. Not saving."
        )
        return False

    try:
        image = Image.open(BytesIO(response.content))
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter under its current name.
        image.thumbnail((320, 240), Image.LANCZOS)
        filename = f"{settings.MEDIA_ROOT}screenshots/{endpoint_id}_latest.png"
        image.save(filename, "PNG")
        log.debug(f"Image saved as: {filename}.")

        scr = Screenshot()
        # NOTE(review): only the date is stored, the time of day is dropped - confirm this is intended.
        scr.created_on = datetime.now(pytz.utc).date()
        # Only the primary key is available here. A bare id cannot be assigned to a ForeignKey attribute
        # (Django raises ValueError), so the <field>_id attribute is set instead.
        # NOTE(review): assumes Screenshot.endpoint is a ForeignKey to Endpoint - confirm against the model.
        scr.endpoint_id = endpoint_id
        scr.filename = filename
        # Keep the file open until after save() so its content can still be read, then close it reliably.
        with open(filename, "rb") as image_file:
            scr.image = File(image_file)
            scr.save()
        log.debug(f"Saved in databased as id:{scr.pk} filename:{scr.filename}.")
    except Exception as e:
        # Best effort: a failed screenshot should not fail the task, but it is logged with its traceback.
        log.exception(e)