dirhunt/crawler_url.py
# -*- coding: utf-8 -*-
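"""Crawl-queue entry for dirhunt: each CrawlerUrl requests a single url,
classifies it (directory, document, asset...) and feeds the crawler's
processors and results queue.
"""
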
from bs4 import BeautifulSoup
from requests import RequestException
from dirhunt.url import Url
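
# Read at most this many bytes from each response body (512 KiB).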
MAX_RESPONSE_SIZE = 1024 * 512
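
# Per-flag weights used by CrawlerUrl.weight() to prioritize urls; flags not
# listed here count as 0.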
FLAGS_WEIGHT = {
'blank': 4,
'not_found.fake': 3,
'html': 2,
}


class CrawlerUrl(object):
def __init__(self, crawler, url, depth=3, source=None, exists=None, type=None, timeout=10):
"""
:type crawler: Crawler
:type depth: int Máxima recursión sin haber subido respecto esta url
"""
self.flags = set()
self.depth = depth
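        # Accept plain strings and normalize them to Url; the query string and
        # fragment are dropped for valid urls.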
if not isinstance(url, Url):
url = Url(url)
if url.is_valid():
url.query = ''
url.fragment = ''
self.url = url
self.crawler = crawler
self.source = source
self.exists = exists
self.type = type
self.timeout = timeout
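        # A url without a path (or with just '/') is treated as a directory.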
if url.is_valid() and (not url.path or url.path == '/'):
self.type = 'directory'
self.resp = None

    def add_self_directories(self, exists=None, type_=None):
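        """Queue every parent directory (breadcrumb) of this url for crawling,
        one depth level lower.
        """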
for url in self.url.breadcrumb():
self.crawler.add_url(CrawlerUrl(self.crawler, url, self.depth - 1, self, exists, type_,
timeout=self.timeout))
            # TODO: if the url cannot be added because it was already added, mark
            # it as already existing when exists is requested.

    def start(self):
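        """Request the url, classify the response, run the matching processor
        and queue the parent directories. Returns self.
        """
        # Local import: likely deferred to avoid a circular dependency with
        # dirhunt.processors.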
from dirhunt.processors import get_processor, GenericProcessor, Error, ProcessIndexOfRequest
session = self.crawler.sessions.get_session()
try:
resp = session.get(self.url.url, stream=True, timeout=self.timeout, allow_redirects=False)
except RequestException as e:
self.crawler.results.put(Error(self, e))
self.close()
return self
self.set_type(resp.headers.get('Content-Type'))
self.flags.add(str(resp.status_code))
text = ''
soup = None
processor = None
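        # Only download the body (capped at MAX_RESPONSE_SIZE) for successful
        # responses that may still be directories.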
if resp.status_code < 300 and self.maybe_directory():
text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
soup = BeautifulSoup(text, 'html.parser')
if self.maybe_directory():
processor = get_processor(resp, text, self, soup) or GenericProcessor(resp, self)
processor.process(text, soup)
self.crawler.results.put(processor)
self.flags.update(processor.flags)
if processor and isinstance(processor, ProcessIndexOfRequest):
self.crawler.index_of_processors.append(processor)
        # TODO: We can look at processor.index_file. If it exists and is a 200,
        # then the url exists.
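        # Any status below 404 (including 401/403) suggests the resource exists.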
if self.exists is None and resp.status_code < 404:
self.exists = True
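        # Parent directories only inherit existence and the 'directory' type
        # when this response cannot come from a url rewrite.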
self.add_self_directories(True if (not self.maybe_rewrite() and self.exists) else None,
'directory' if not self.maybe_rewrite() else None)
self.close()
return self

    def set_type(self, content_type):
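        """Classify the url from the response Content-Type: non-html responses
        become 'asset'; html responses whose name is a known index file become
        'document'.
        """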
from dirhunt.processors import INDEX_FILES
if not self.type and not (content_type or '').startswith('text/html'):
self.type = 'asset'
if not self.type and (content_type or '').startswith('text/html') and self.url.name in INDEX_FILES:
self.type = 'document'

    def maybe_rewrite(self):
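        """Return True if this url could still be the product of a url rewrite."""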
return self.type not in ['asset', 'directory']

    def maybe_directory(self):
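        """Return True if this url may still be a directory (that is, it has not
        been classified as an asset, document or rewrite).
        """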
        return self.type not in ['asset', 'document', 'rewrite']

    def result(self):
        # When future.result() is called and this url has already been
        # processed, return itself.
        return self

    def weight(self):
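        """Priority score: the sum of this url's flag weights minus 1.5 per
        breadcrumb level, so deeper urls rank lower.
        """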
        value = sum(FLAGS_WEIGHT.get(flag, 0) for flag in self.flags)
if self.url and self.url.is_valid():
value -= len(list(self.url.breadcrumb())) * 1.5
return value

    def close(self):
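        """Move this url from the crawler's processing map to processed."""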
self.crawler.processed[self.url.url] = self
del self.crawler.processing[self.url.url]
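
# Rough usage sketch (an assumption; in practice dirhunt's Crawler creates and
# schedules these objects itself):
#
#   crawler_url = CrawlerUrl(crawler, 'http://example.com/admin/', depth=3)
#   crawler.add_url(crawler_url)  # the crawler eventually calls crawler_url.start()
#
# `crawler` must provide the attributes used above: sessions, results,
# processed, processing, index_of_processors and add_url().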