# -*- coding: utf-8 -*-
"""
transistor.examples.books_to_scrape.workgroup
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This module implements a working example of a BaseWorker and BaseGroup.

:copyright: Copyright (C) 2018 by BOM Quote Limited
:license: The MIT License, see LICENSE for more details.
~~~~~~~~~~~~
"""

from transistor import BaseWorker
from examples.books_to_scrape.persistence import ndb
from transistor.persistence.newt_db.collections import SpiderList
from transistor.utility.logging import logger
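
# A minimal sketch of how `ndb` might be wired up inside
# examples.books_to_scrape.persistence; the DSN below is a placeholder and
# the real module may differ:
#
#     import newt.db
#     ndb = newt.db.connection('postgresql://user:pass@localhost/books')
#
# newt.db exposes a ZODB-style root via `ndb.root` and persists pending
# changes to PostgreSQL on `ndb.commit()`.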


class BooksWorker(BaseWorker):
    """
    A Worker wraps a custom Spider object and processes it after it returns
    data from a scrape or crawl. Workers can be combined into a Group of
    arbitrary size, enabling gevent-based asynchronous I/O.

    First, inherit from BaseWorker, then implement the pre_process_exports
    and/or post_process_exports methods, as shown below. Other methods that
    can easily be overridden include get_spider and get_spider_extractor;
    even process_exports can be overridden if needed.

    Also, add any extra class attributes needed to support your custom
    Spider and Exporters.
    """

    def pre_process_exports(self, spider, task):
        """
        A hook point for customization before process_exports method is
        called.

        In this example, we use this method to save our spider data to
        postgresql using newt.db.

        :param spider: the Scraper or Crawler object (i.e. MouseKeyScraper())
        :param task: just passing through the task item for printing.
        """
        if self.job_id != 'NONE':
            try:
                # create a list for this job_id if one doesn't already exist
                ndb.root.spiders.add(self.job_id, SpiderList())
                logger.info(f'Worker {self.name}-{self.number} created a new spider '
                            f'list for {self.job_id}')
            except KeyError:
                # raised when a list with this job_id already exists
                pass
            # load the scraped data into an items object
            items = self.load_items(spider)
            # save the items object to newt.db
            ndb.root.spiders[self.job_id].add(items)
            ndb.commit()
            logger.info(f'Worker {self.name}-{self.number} saved {items!r} to '
                        f'scrape_list "{self.job_id}" for task {task}.')
        else:
            # job_id is NONE, so skip saving the scraped items
            logger.info(f'Worker {self.name}-{self.number} has job_id {self.job_id} '
                        f'so it will not save the scraped items.')
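
    # A sketch of reading the saved items back out of PostgreSQL in a later
    # session; how a SpiderList is iterated depends on its API, so treat the
    # loop below as an assumption:
    #
    #     saved = ndb.root.spiders['books_scrape']  # lookup by job_id
    #     for item in saved:
    #         print(item)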

    def post_process_exports(self, spider, task):
        """
        A hook point for customization after process_exports.

        In this example, we append the returned scraper object to a
        class attribute called `events`.

        """
        self.events.append(spider)
        logger.info(f'{self.name} reports stock status: {spider.stock}')
        logger.info(f'{self.name} reports price: {spider.price}')
        logger.info(f'Worker {self.name}-{self.number} finished task {task}')
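
# A minimal sketch of pairing BooksWorker with a BaseGroup, completing the
# BaseWorker/BaseGroup example this module's docstring describes. The names
# BooksToScrapeGroup, hired_worker, and BooksToScrapeScraper are assumptions
# drawn from the wider books_to_scrape example, not guaranteed API:
#
#     from transistor import BaseGroup
#
#     class BooksToScrapeGroup(BaseGroup):
#
#         def hired_worker(self):
#             """Return one BooksWorker wrapping the custom spider."""
#             worker = BooksWorker(job_id=self.job_id,
#                                  scraper=BooksToScrapeScraper,
#                                  **self.kwargs)
#             worker.name = self.name
#             return worker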