# -*- coding: utf-8 -*-
"""
transistor.scrapers.splash_scraper_abc
~~~~~~~~~~~~
This module implements an abstract base class for a scraper which assumes the
use of a Splash headless browser/javascript rendering service. It also supports
the optional use of the Crawlera smart proxy service from scrapinghub.com.

This class must be subclassed and all abstract methods implemented. To create a
final scraper implementation, customize the methods as needed to scrape the
target website (a minimal sketch is shown in the Example below).

Notes:
    - self.browser provides a class based on mechanicalsoup with a similar API
    - self.browser.session provides direct access to the python-requests object
    - beautifulsoup4 methods can be used on the self.page attribute
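
Example:
    A minimal subclass sketch, only to show where self.page and the abstract
    methods fit in. BooksToScrapeScraper and its parsing logic below are
    illustrative assumptions, not part of this module:

        class BooksToScrapeScraper(SplashScraper):
            # scrape the listed price of a book from books.toscrape.com

            def __init__(self, book_title: str, **kwargs):
                super().__init__(**kwargs)
                self.book_title = book_title
                self.price = None

            def start_http_session(self, url=None, **kwargs):
                # open the page, then parse it with beautifulsoup4 methods
                super().start_http_session(url=url, **kwargs)
                self._find_price()

            def _find_price(self):
                # self.page is the rendered page, as a beautifulsoup4 object
                anchor = self.page.find('a', title=self.book_title)
                if anchor:
                    self.price = anchor.find_parent('article').find(
                        'p', class_='price_color').text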

:copyright: Copyright (C) 2018 by BOM Quote Limited
:license: The MIT License, see LICENSE for more details.
~~~~~~~~~~~~
"""

import os
import ssl
from abc import ABC, abstractmethod
from w3lib.http import basic_auth_header
from requests.utils import dict_from_cookiejar
from requests.adapters import HTTPAdapter
from pkgutil import get_data
from transistor.browsers.splash_browser import SplashBrowser


class SplashScraper(ABC):
    """
    Base class to help implement any kind of Splash or Splash + Crawlera scraper.

    Note:
        This is the abstract base class and you must subclass it to use it.
    """

    __attrs__ = [
        'auth', 'baseurl', 'browser', 'cookies', 'crawlera_user',
        'http_session_timeout', 'http_session_valid', 'LUA_SOURCE', 'max_retries',
        'name', 'number', 'referrer', 'searchurl', 'splash_args', 'splash_wait',
        'user_agent',
    ]

    # class attrs used for concurrency
    baseurl = None
    name = None
    number = None

    @abstractmethod
    def __init__(self, script:str=None, **kwargs):
        """
        Create the instance. Required attributes are listed below as kwarg
        parameters. They can be set with kwargs or else should be explicitly
        specified in a subclass __init__ and then have a call to
        super().__init__() made.

        :param script: your custom lua script, if any, which is assigned to
        LUA_SOURCE. Pass it as a string containing the contents of the
        specified resource. One way to accomplish this is to use the
        `get_data` function from pkgutil, which returns the contents of a
        file as a binary string. Import it like `from pkgutil import get_data`
        and then use .decode('utf-8') to go from binary to string (see the
        Example at the end of this docstring).

        :param kwargs: <item>:str() where <item> is an appropriate keyword search
        term, like 'book_title' or 'part_number'. Set this keyword in your subclassed
        scraper as required.

        :param kwargs: baseurl:str() the baseurl like 'http://books.toscrape.com/'

        :param kwargs: searchurl:str() a specific url to execute searches on your
        scrape target's website, if any.

        :param kwargs: referrer: str(): the referrer to include in the headers of your
        scraper. Default is 'https://www.google.com'.

        :param kwargs: user_agent:str() set a custom user-agent header string like
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
        (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'

        :param kwargs: name:str() give the scraper a name for easy reference.

        :param kwargs: max_retries:int() The maximum number of retries each
        connection should attempt. Note, this applies only to failed DNS
        lookups, socket connections and connection timeouts, never to requests
        where data has made it to the server. By default, Requests does not
        retry failed connections. If you need granular control over the
        conditions under which we retry a request, import urllib3's ``Retry``
        class and pass that instead.

        :param kwargs: http_session_timeout:tuple(float, float) => (3.05, 10.05)
        The first number is the maximum number of seconds allowed to connect to the
        local Splash server. The second number is the number of seconds the client
        will wait between bytes sent from the server (it is not the total timeout).

        :param kwargs: splash_wait:float() controls the time in seconds Splash will
        wait after opening a web page, before taking actions. Default 3.0 sec.

        :param kwargs: splash_args:dict(): a python dict which will be sent in a post
        request to the Splash service. This dict will serve to set the splash.args
        attributes so they are available for use in the LUA script simply by
        referencing splash.args. Default is as below:
        self.splash_args = {
                'lua_source': self.LUA_SOURCE,
                'url': url,
                'crawlera_user': self.crawlera_user,
                # sets Splash to cache the lua script, avoids sending it every request
                'cache_args': 'lua_source',
                'timeout': timeout[1],  # timeout (in seconds) for the render, 3600 max
                'session_id': 'create',
                'referrer': self.referrer if self.referrer is not None else
                            "https://www.google.com",
                'searchurl': self.searchurl,
                'keyword': keyword,  # can be used in the LUA script to submit a form
                'cookies': self.cookies,
                'user_agent': self.user_agent,
                'splash_wait': self.splash_wait,
                'js_source': self.js_source
            }
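
        Example: a minimal construction sketch. MyScraper is assumed to be a
        concrete subclass of this class; the package name, lua file path and
        keyword values shown here are illustrative assumptions only:

            from pkgutil import get_data

            # load a custom lua script packaged inside your own project
            script = get_data(
                'my_package', 'scrapers/scripts/my_script.lua').decode('utf-8')

            scraper = MyScraper(
                script=script,
                baseurl='http://books.toscrape.com/',
                name='books.toscrape',
                splash_wait=3.0,
                http_session_timeout=(3.05, 700.0))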
        """
        super().__init__()

        # The splash lua script. Provide a custom lua script to fit your use case.
        if script:
            self.LUA_SOURCE = script
        else:
            self.LUA_SOURCE = get_data(
                'transistor',
                'scrapers/scripts/basic_splash.lua').decode('utf-8')

        # after calling super().__init__(), call self.start_http_session()

        # ------------------ kwargs ---------------- #
        # Set these as needed in your subclass with keywords or hardcoded.
        self.baseurl = kwargs.pop('baseurl', None)
        self.searchurl = kwargs.pop('searchurl', None)
        self.crawlera_user = kwargs.pop('crawlera_user', None)
        self.name = kwargs.pop('name', None)
        self.referrer = kwargs.pop('referrer', None)
        self.user_agent = kwargs.pop('user_agent',
                                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                     "AppleWebKit/537.36 (KHTML, like Gecko) "
                                     "Chrome/73.0.3683.86 Safari/537.36")
        self.max_retries = kwargs.pop('max_retries', 5)
        self.http_session_timeout = kwargs.pop('http_session_timeout', (3.05, 10.05))
        self.splash_args = kwargs.pop('splash_args', None)
        self.splash_wait = kwargs.pop('splash_wait', 3.0)
        self.js_source = kwargs.pop('js_source', None)

        # ----- kwargs only used for testing setup ----- #
        self._test_true = kwargs.get('_test_true', False)
        self._test_page_text = kwargs.get('_test_page_text', None)
        self._test_status_code = kwargs.get('_test_status_code', None)
        self._test_url = kwargs.get('_test_url', None)
        self._test_soup_config = kwargs.get('_test_soup_config', None)
        # ----- end kwargs for testing setup ----- #

        # ------ flags for internal use --------- #
        # For example, if a public method on your scraper returns
        # None undesirably, switch the self._result flag to False.
        # Then, you can just delete scrape results if flagged False.
        self._result = True
        # ------- /end internal use flags -------- #

        # Whether we already have a valid HTTP session with the remote server
        self.http_session_valid = False

        # ssl._create_default_https_context = ssl._create_unverified_context
        self._crawlera_ca = get_data('transistor',
                            'scrapers/certs/crawlera-ca.crt').decode('utf-8')

        ssl.create_default_context(cadata=self._crawlera_ca)

        self.browser = SplashBrowser(
            soup_config={'features': 'lxml'},
            requests_adapters={'http://': HTTPAdapter(max_retries=self.max_retries)})

        self.cookies = dict_from_cookiejar(self.browser.session.cookies)

        # set the splash basic authorization
        self.auth = basic_auth_header(
            username=os.environ.get('SPLASH_USERNAME', 'user'),
            password=os.environ.get('SPLASH_PASSWORD', 'userpass'))
        self.browser.session.headers.update({'Authorization': self.auth})

    def __repr__(self):
        return f'<SplashScraper({self.name})>'

    @abstractmethod
    def start_http_session(self, url=None, **kwargs):
        """
        Start the connection.

        :param url: The url you would like your scraper to use
        for the initial landing. If you do not provide this url, but
        you do have a self.searchurl attribute set, then this method
        will default to using self.searchurl.

        :param timeout: About the timeout parameter:

        First, you may want to review the python-request docs about timeouts.
        http://docs.python-requests.org/en/master/user/advanced/#timeouts

        If using Crawlera, the recommended timeout (2nd number) from
        crawlera docs is at least 600 seconds. In practice, I've found
        that 600.0 is too low for several websites which we scrape. In
        some cases, while using Crawlera, we need to set this second timeout
        number as high as 1200 seconds.

        If you use Crawlera, you will need to adjust the second timeout
        based on the individual website. If the website makes 150 requests to
        load the web page, and you don't optimize your LUA script to drop
        unnecessary requests, then you can expect to need up to
        12 seconds x 150 requests = 1,800 seconds to fully load the page
        through Crawlera. Scraping is a slow process with Splash + Crawlera.

        https://support.scrapinghub.com/support/solutions/articles/22000188397-crawlera-best-practices
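
        Example: a minimal call sketch, assuming `scraper` is an instance of a
        concrete subclass and the timeout values suit your target website:

            # allow up to 700 seconds for Splash (+ Crawlera) to render the page
            scraper.start_http_session(
                url='http://books.toscrape.com/', timeout=(3.05, 700.0))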
        """
        timeout = kwargs.pop('timeout', self.http_session_timeout)

        if self._test_true:
            self.http_session_valid = True
            return self.browser.open_fake_page(
                page_text=self._test_page_text, status_code=self._test_status_code,
                url=self._test_url, soup_config=self._test_soup_config)

        if not self.http_session_valid:
            # Send the post request to Splash
            if url is None:
                url = self.searchurl
            return self._stateful_post(url=url, splash_args=None, timeout=timeout)

    def open(self, url, splash_args:dict=None, reuse_session=False, *args, **kwargs):
        """
        Open a url page in Splash, by sending a post request to your local
        running Splash service. If you are using Crawlera, you can reuse
        the current session, by setting the reuse_session flag to True.

        This method is intended to provide flexibility in link following, after
        the initial HTTP connection has been made with start_http_session().

        Note:
        Generally, you should create the first network request for any subclass
        based off this SplashScraper class with start_http_session().

        But, if you do use this open() method while bypassing the initial
        start_http_session() call, you must explicitly provide the
        splash_args=dict(), and you also need to set
        self.http_session_valid = True. Otherwise, this class will break.

        :param url: the url to open

        :param splash_args: a python dict which will be sent in a post
        request to the Splash service, which will set the dict(key, value) pairs
        to then be accessible from inside the lua script as splash.args.

        :param reuse_session: whether to use the existing crawlera session_id
        in the call to open().

        :param kwargs: timeout: see note in self.start_http_session
        :param kwargs: callback: a user defined function to call after the
        response is returned. Useful for crawling.
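
        Example: a minimal link-following sketch, assuming `scraper` has already
        made its first request with start_http_session(), so self.splash_args is
        populated; the url shown here is only illustrative:

            # follow a link while reusing the existing Crawlera session id
            response = scraper.open(
                'http://books.toscrape.com/catalogue/page-2.html',
                reuse_session=True)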
        """
        timeout = kwargs.pop('timeout', self.http_session_timeout)

        if reuse_session:
            self._reuse_crawlera_session()

        if splash_args:
            self.splash_args = splash_args

        return self._stateful_post(url, splash_args=splash_args, timeout=timeout,
                                   *args, **kwargs)

    def _stateful_post(self, url, *args, **kwargs):
        """
        Execute sending the post request to the local running Splash service with our
        self.browser object.

        :param url: the url to open
        :param splash_args: a python dict which will be sent in a post request
        to the Splash service. This dict will serve to set the splash.args
        attributes so they are available for use in the LUA script simply by
        referencing splash.args.
        :param timeout: see note in start_http_session() for more detail
        """
        timeout = kwargs.pop('timeout', self.http_session_timeout)
        keyword = kwargs.pop('keyword', None)
        splash_args = kwargs.pop('splash_args', self.splash_args)

        if not splash_args:
            self.splash_args = {
                'lua_source': self.LUA_SOURCE,
                'url': url,
                'crawlera_user': self.crawlera_user,
                # set Splash to cache the lua script, to avoid sending it every request
                'cache_args': 'lua_source',
                'timeout': timeout[1],  # timeout (in seconds) for the render, 3600 max
                'session_id': 'create',
                'referrer': self.referrer if self.referrer is not None else
                            "https://www.google.com",
                'searchurl': self.searchurl,
                'keyword': keyword,  # can be used in the LUA script to submit a form
                'cookies': self.cookies,
                'user_agent': self.user_agent,
                'splash_wait': self.splash_wait,
                'js_source':  ";" if not self.js_source else self.js_source,
                'script': 0 if not self.js_source else 1
            }
        else:
            self.splash_args = splash_args
        response = self.browser.stateful_post(
            # 'http://localhost:8050/render.json' if self.js_source else
            'http://localhost:8050/execute',
            json=self.splash_args,
            timeout=timeout,
            verify=self._crawlera_ca,
            stream=True, *args, **kwargs)
        self.http_session_valid = True
        return response

    @property
    def session_id(self):
        """
        This is the crawlera session id which is returned by X-Crawlera-Session.

        Reusing the same crawlera session during the lifetime of this object may be
        useful, depending on your need.

        """
        return self.browser.crawlera_session

    def _reuse_crawlera_session(self):
        """
        Update the self.splash_args['session_id'] to the current session_id so
        that the current session will be extended with a self.open() request.

        """
        self.splash_args['session_id'] = self.session_id

    @property
    def page(self):
        """return the page"""
        return self.browser.get_current_page()