LucaCappelletti94/tinycrawler

View on GitHub
tinycrawler/robots/robots.py

Summary

Maintainability
A
0 mins
Test Coverage
"""Handle UrlJob."""
from ..statistics import Statistics
from ..log import Log
from ..utils import get_domain
from urllib.robotparser import RobotFileParser
from ..eta import Eta


class Robots(dict):
    """Cache of parsed robots.txt files, keyed by domain.

    Every value is the ``RobotFileParser`` built for that domain. An ``Eta``
    tracker is asked, before each lookup, whether the cached entry has to be
    downloaded again.
    """

    def __init__(self, robots_timeout: float):
        """Create an empty cache.
            robots_timeout:float, value handed to ``Eta``; presumably the
                lifetime of a cached robots.txt - confirm against ``Eta``.
        """
        super().__init__()
        self._eta = Eta(robots_timeout)

    def can_fetch(self, url: str) -> bool:
        """Return a bool representing if given url can be fetched.
            url:str, the url to check for.

        The rules applied are those of user agent "*" in the robots.txt of
        the url's domain, which is downloaded first when missing or ripe.
        """
        domain = get_domain(url)
        self._validity_check(domain)
        return self[domain].can_fetch("*", url)

    def timeout(self, domain: str) -> float:
        """Return the delay required between requests to given domain.
            domain:str, the domain whose robots.txt is consulted.

        The result is the larger of the Crawl-delay value and the seconds
        per request implied by Request-rate, both read for user agent "*".
        A missing directive counts as 0.
        """
        self._validity_check(domain)
        parser = self[domain]

        delay = parser.crawl_delay("*")
        if delay is None:
            delay = 0

        requests_rate_delay = 0
        requests_rate = parser.request_rate("*")
        # The parser accepts "Request-rate: 0/N"; such a rate carries no
        # usable delay, so it is ignored instead of dividing by zero.
        if requests_rate is not None and requests_rate.requests > 0:
            requests_rate_delay = (
                requests_rate.seconds / requests_rate.requests
            )

        return max(delay, requests_rate_delay)

    def _validity_check(self, domain: str) -> None:
        """Make sure an up to date robots.txt is cached for given domain.
            domain:str, the domain to check for.

        The file is downloaded when the domain is not cached yet or when
        the ``Eta`` tracker reports the domain as ripe.
        """
        if domain not in self or self._eta.is_ripe(domain):
            self._retrieve_robots_txt(domain)

    def _retrieve_robots_txt(self, domain: str) -> None:
        """Download robots.txt from given domain and parse it.
            domain:str, the domain from which to download the robots.txt

        The parser is stored under the domain key and the domain is
        registered in the ``Eta`` tracker. Exceptions raised while reading
        are not caught here; in that case nothing is stored or registered.
        """
        parser = RobotFileParser("{domain}/robots.txt".format(domain=domain))
        parser.read()
        self._eta.add(domain)
        self[domain] = parser