LucaCappelletti94/tinycrawler

tinycrawler/process/parser.py

from multiprocessing import Event, Value
from queue import Empty, Queue
from typing import Callable, Set
from urllib.parse import urljoin

import validators
from bs4 import BeautifulSoup
from requests import Response

from .process_handler import ProcessHandler
from ..log import Log
from ..robots import Robots
from ..statistics import Statistics
from ..urls import Urls


class Parser(ProcessHandler):
    def __init__(self, process_spawn_event: Event, process_callback_event: Event,
                 pages_number: Value, urls_number: Value, responses: Queue, urls: Urls,
                 robots: Robots, file_parser: Callable[[str, BeautifulSoup, Set[str], Log], None],
                 url_validator: Callable[[str, Log], bool], statistics: Statistics, logger: Log,
                 follow_robots_txt: bool, parser_library: str):
        super().__init__("Parser", statistics, process_spawn_event)
        self._urls = urls
        self._responses = responses
        self._process_callback_event = process_callback_event
        self._pages_number = pages_number
        self._urls_number = urls_number
        self._robots = robots
        self._follow_robots_txt = follow_robots_txt
        self._file_parser = file_parser
        self._url_validator = url_validator
        self._logger = logger
        self._parser_library = parser_library

    def _url_parser(self, page_url: str, soup: BeautifulSoup) -> Set[str]:
        urls = set()
        # Resolve each href against the page url and keep only links that are valid,
        # accepted by the user validator and (optionally) allowed by robots.txt.
        for a in soup.find_all("a", href=True):
            url = urljoin(page_url, a.get("href"))
            if (
                validators.url(url) and
                self._url_validator(url, self._logger) and
                (not self._follow_robots_txt or self._robots.can_fetch(url))):
                urls.add(url)
        if urls:
            # put() returns the number of urls it accepted; update counters only for those.
            n = self._urls.put(urls)
            if n > 0:
                self._urls_number.value += n
                self._statistics.add("Urls", "Total Urls", n)
                self._process_callback_event.set()
        return urls

    def _enough(self, active_processes):
        # Enough parser processes exist while fewer than 20 pending pages remain per active process.
        return active_processes * 20 > self._pages_number.value

    def _target(self, response: Response):
        """Parse the downloaded files, cleans them and extracts urls"""
        soup = BeautifulSoup(response.text, self._parser_library)
        urls = self._url_parser(response.url, soup)
        self._file_parser(response.url, soup, urls, self._logger)

    def _get_job(self):
        """Pop the next downloaded response and update the remaining-pages counters."""
        response = self._responses.get_nowait()
        self._pages_number.value -= 1
        self._statistics.set("Pages", "Remaining Pages",
                             self._pages_number.value)
        return (response,)
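
Below is a minimal, standalone sketch of the link-extraction step performed by _url_parser; the sample markup, base url and expected output are illustrative assumptions and not part of tinycrawler:

from urllib.parse import urljoin

import validators
from bs4 import BeautifulSoup

page_url = "https://example.com/index.html"  # hypothetical page being parsed
html = '<a href="/docs">Docs</a><a href="mailto:team@example.com">Mail</a>'

soup = BeautifulSoup(html, "html.parser")
found = set()
for a in soup.find_all("a", href=True):
    # Resolve relative hrefs against the page url, as _url_parser does.
    candidate = urljoin(page_url, a.get("href"))
    # validators.url() is falsy for the mailto: link, so only real web urls survive.
    if validators.url(candidate):
        found.add(candidate)

print(found)  # {'https://example.com/docs'}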