derwentx/feature-hunter
feature_hunter/crawler.py

""" Provide crawler functions for feature_hunter. """

import json
import re

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.selector import SelectorList
# scrapy.xlib.pydispatch is a deprecated alias removed in newer Scrapy releases;
# `from pydispatch import dispatcher` is the drop-in replacement there.
from scrapy.xlib.pydispatch import dispatcher


class GeneralizedRecordSpider(scrapy.Spider):
    """Generic spider that turns record/field selector specs into dicts."""

    name = "records"

    def __init__(self, record_spec, field_specs, start_urls, *args, **kwargs):
        super(GeneralizedRecordSpider, self).__init__(*args, **kwargs)
        self.start_urls = start_urls
        self.field_specs = field_specs
        self.record_spec = record_spec

    # TODO: factor the repeated css/xpath(/jsonpath) dispatch in get_field_raw
    # and parse into a single query_selector(selector, query_type, query) helper.

    def get_field_raw(self, record_selector, field_spec):
        """Extract the raw value for one field from a record selector."""
        if field_spec.get('css'):
            return record_selector.css(field_spec['css']).extract_first()
        elif field_spec.get('xpath'):
            return record_selector.xpath(field_spec['xpath']).extract_first()
        # Fall back to the record's own content when no selector is given.
        return record_selector.extract()

    def get_record_fields(self, record_selector):
        """Build a dict of field values for a single record selector.

        A field that cannot be extracted, or whose regex does not match,
        stays None.
        """
        record_fields = {}
        for field_name, field_spec in self.field_specs.items():
            record_fields[field_name] = None
            field_raw = self.get_field_raw(record_selector, field_spec)
            if not field_raw:
                continue
            if field_spec.get('regex'):
                field_match = re.search(field_spec.get('regex'), field_raw)
                if not field_match:
                    continue
                # Keep only the first capture group of the regex.
                record_fields[field_name] = field_match.group(1)
            else:
                record_fields[field_name] = field_raw
        return record_fields

    def parse(self, response):
        """Yield one dict of fields per record matched by record_spec."""
        if self.record_spec.get('css'):
            record_selectors = response.css(self.record_spec['css'])
        elif self.record_spec.get('xpath'):
            record_selectors = response.xpath(self.record_spec['xpath'])
        # TODO: support 'jsonpath' record specs for JSON responses
        # (see GeneralizedJsonSpider below).
        else:
            record_selectors = SelectorList()
        for record_selector in record_selectors:
            yield self.get_record_fields(record_selector)


class GeneralizedHtmlRecordSpider(GeneralizedRecordSpider):
    """HTML variant; the inherited css/xpath record extraction covers it."""

    name = "html_records"


class GeneralizedJsonSpider(GeneralizedRecordSpider):
    """JSON variant; record extraction from JSON bodies is not implemented yet."""

    name = "json_records"

    def parse(self, response):
        # TODO: select records from the decoded JSON (e.g. via 'jsonpath'
        # entries in record_spec and field_specs) and yield field dicts.
        json_response = json.loads(response.body_as_unicode())
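

# A minimal sketch of how the JSON spider could complete the stub above. It
# assumes the third-party jsonpath-rw package and 'jsonpath' keys in
# record_spec/field_specs; neither assumption comes from this module, so the
# class below is illustrative rather than part of the crawler's API.
try:
    from jsonpath_rw import parse as jsonpath_parse
except ImportError:  # assumed optional dependency; the sketch is inert without it
    jsonpath_parse = None


class JsonPathRecordSpiderSketch(GeneralizedRecordSpider):
    name = "jsonpath_records_sketch"

    def parse(self, response):
        json_response = json.loads(response.body_as_unicode())
        # Each match of the record-level jsonpath expression is one record.
        record_expr = jsonpath_parse(self.record_spec['jsonpath'])
        for match in record_expr.find(json_response):
            record = match.value
            fields = {}
            for field_name, field_spec in self.field_specs.items():
                # Take the first jsonpath match for each field, or None.
                field_expr = jsonpath_parse(field_spec['jsonpath'])
                values = [m.value for m in field_expr.find(record)]
                fields[field_name] = values[0] if values else None
            yield fields
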

def get_html_crawler_records(record_spec, field_specs, start_urls, settings=None):
    """Crawl start_urls with GeneralizedHtmlRecordSpider and return the scraped items."""
    if not settings:
        settings = {
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'LOG_LEVEL': 'INFO',
        }
    process = CrawlerProcess(settings)
    process.crawl(
        GeneralizedHtmlRecordSpider,
        record_spec=record_spec,
        field_specs=field_specs,
        start_urls=start_urls
    )

    # Collect items as they are scraped; item_scraped is the current name of
    # the signal (item_passed was the older alias).
    items = []

    def add_item(item):
        items.append(item)

    dispatcher.connect(add_item, signal=signals.item_scraped)
    process.start()  # blocks until the crawl finishes
    return items
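

# Example usage (illustrative only: the spec format mirrors get_field_raw and
# get_record_fields above, but the URL, selectors and field names are invented
# for the example, not taken from this repository):
#
#     records = get_html_crawler_records(
#         record_spec={'css': 'div.listing'},
#         field_specs={
#             'title': {'css': 'h2.title::text'},
#             'price': {'css': 'span.price::text', 'regex': r'([\d.]+)'},
#         },
#         start_urls=['https://example.com/listings'],
#     )
#     # -> one dict per matched record, e.g. [{'title': ..., 'price': ...}, ...]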