Nekmo/dirhunt

dirhunt/crawler.py

# -*- coding: utf-8 -*-
from concurrent.futures import ThreadPoolExecutor
# _python_exit is the private atexit hook that joins worker threads at exit;
# it is unregistered in Crawler.close() so the process can exit immediately.
from concurrent.futures.thread import _python_exit
from threading import Lock, ThreadError
import datetime

import atexit
import humanize
from click import get_terminal_size

from dirhunt._compat import queue, Queue
from dirhunt.cli import random_spinner
from dirhunt.crawler_url import CrawlerUrl
from dirhunt.exceptions import EmptyError, RequestError, reraise_with_stack
from dirhunt.sessions import Sessions
from dirhunt.url_info import UrlsInfo

"""Flags importance"""


class Crawler(ThreadPoolExecutor):
    def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
                 progress_enabled=True, timeout=10):
        super(Crawler, self).__init__(max_workers)
        self.domains = set()
        self.results = Queue()
        self.index_of_processors = []
        self.sessions = Sessions()
        self.processing = {}
        self.processed = {}
        self.add_lock = Lock()
        self.spinner = random_spinner()
        self.start_dt = datetime.datetime.now()
        self.interesting_extensions = interesting_extensions or []
        self.interesting_files = interesting_files or []
        self.closing = False
        self.std = std or None
        self.progress_enabled = progress_enabled
        self.timeout = timeout

    def add_init_urls(self, *urls):
        """Add the initial urls to the queue, wrapping plain strings in
        CrawlerUrl and registering each url's domain as in scope.
        """
        for crawler_url in urls:
            if not isinstance(crawler_url, CrawlerUrl):
                crawler_url = CrawlerUrl(self, crawler_url, timeout=self.timeout)
            self.domains.add(crawler_url.url.only_domain)
            self.add_url(crawler_url)

    def in_domains(self, domain):
        """Return True if ``domain`` or any of its parent domains is in scope."""
        initial_domain = domain
        while True:
            if domain in self.domains:
                if initial_domain != domain:
                    # Cache the subdomain so future lookups match directly
                    self.domains.add(initial_domain)
                return True
            parts = domain.split('.')
            if len(parts) <= 2:
                return False
            # Retry with the parent domain (strip the left-most label)
            domain = '.'.join(parts[1:])

    def add_url(self, crawler_url, force=False):
        """Queue a crawler_url for processing, deduplicating urls that are
        already being processed or have been processed."""

        with self.add_lock:
            url = crawler_url.url
            if not url.is_valid() or not url.only_domain or not self.in_domains(url.only_domain):
                return
            if url.url in self.processing or url.url in self.processed:
                # Already queued or finished: return the existing future/result
                return self.processing.get(url.url) or self.processed.get(url.url)

            fn = reraise_with_stack(crawler_url.start)
            if self.closing:
                return
            if force:
                # Forced urls get their own single-worker executor so they run
                # even if this pool is saturated or shutting down
                future = ThreadPoolExecutor(max_workers=1).submit(fn)
            else:
                future = self.submit(fn)
            self.processing[url.url] = future
            return future

    def echo(self, body):
        if self.std is None:
            return
        # TODO: strip ANSI escape codes when the output is not a tty
        self.std.write(str(body))
        self.std.write('\n')

    def erase(self):
        if self.std is None or not self.std.isatty():
            return
        CURSOR_UP_ONE = '\x1b[1A'
        ERASE_LINE = '\x1b[2K'
        # This can be improved. In the future we may want to define stdout/stderr with a cli option
        # fn = sys.stderr.write if sys.stderr.isatty() else sys.stdout.write
        self.std.write(CURSOR_UP_ONE + ERASE_LINE)

    def print_progress(self, finished=False):
        if not self.progress_enabled:
            # Progress output is disabled
            return
        self.echo('{} {} {}'.format(
            next(self.spinner),
            'Finished after' if finished else 'Started',
            (humanize.naturaldelta if finished else humanize.naturaltime)(datetime.datetime.now() - self.start_dt),
        ))

    def print_results(self, exclude=None, include=None):
        exclude = exclude or set()
        self.echo('Starting...')
        while True:
            result = None
            try:
                # Poll with a timeout so the spinner keeps updating while idle
                result = self.results.get(timeout=.5)
            except queue.Empty:
                pass
            self.erase()
            if result and result.maybe_directory() and not (result.crawler_url.flags & exclude) \
                    and (not include or (include & result.crawler_url.flags)):
                self.echo(result)
            self.print_progress()
            if not self.processing:
                # No urls left in flight: crawling has finished
                self.erase()
                self.print_progress(True)
                return

    def print_urls_info(self):
        if not self.index_of_processors:
            self.echo('No interesting files detected ¯\\_(ツ)_/¯')
            return
        self.echo('━' * get_terminal_size()[0])
        UrlsInfo(self.index_of_processors, self.sessions, self.std, self._max_workers, self.progress_enabled,
                 self.timeout).start()

    def restart(self):
        """Release add_lock if it was left acquired (for example after an
        interrupted crawl) so that crawling can be resumed."""
        try:
            self.add_lock.release()
        except (ThreadError, RuntimeError):
            pass

    def close(self):
        self.closing = True
        # Do not wait for queued work to finish
        self.shutdown(False)
        # Unregister concurrent.futures' atexit hook so the interpreter does not
        # block at exit waiting for the remaining worker threads
        atexit.unregister(_python_exit)
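

# --- Usage sketch (not part of the original module) ---
# A minimal, hypothetical example of driving Crawler directly; the real entry
# point lives in dirhunt's CLI. The url, worker count and extensions below are
# illustrative assumptions, not project defaults.
if __name__ == '__main__':
    import sys

    crawler = Crawler(max_workers=4, interesting_extensions=['php', 'zip'],
                      interesting_files=['.htaccess'], std=sys.stdout,
                      progress_enabled=True, timeout=10)
    crawler.add_init_urls('http://example.com/')  # plain strings become CrawlerUrl
    try:
        crawler.print_results(exclude=set(), include=None)
    except KeyboardInterrupt:
        pass
    crawler.print_urls_info()
    crawler.close()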