crawley-project/crawley

crawley/conf/templates/crawlers.tm

from crawley.crawlers import BaseCrawler
from crawley.scrapers import BaseScraper
from crawley.extractors import XPathExtractor
from models import *

class %(project_name)sScraper(BaseScraper):

    # Specify the URL patterns that can be scraped by this class.
    # Example: ["%%python.org%%"] (where %% is a wildcard)
    matching_urls = []

    def scrape(self, response):

        # Parse the HTML and populate your model's tables here.
        # Example:
        #   data = response.html.xpath("//html/body/center/div[2]/div")
        #   ModelClass(text=data[0].text)
        pass


class %(project_name)sCrawler(BaseCrawler):

    # Add your starting URLs here.
    # Example: ["http://packages.python.org/crawley/"]
    start_urls = []

    # Add your scraper classes here.
    # Example: [%(project_name)sScraper]
    scrapers = []

    # Specify your maximum crawling depth.
    max_depth = 1

    # Select your favourite HTML parsing tool.
    extractor = XPathExtractor
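
The %(project_name)s placeholders are filled in by crawley's project generator, and the doubled %% in the wildcard example collapses to a single % once Python's %-formatting runs, so the wildcard character in a generated project is a plain %. As a rough sketch of what a filled-in module could look like, assuming a hypothetical PackageData model defined in models.py and a made-up target site:

from crawley.crawlers import BaseCrawler
from crawley.scrapers import BaseScraper
from crawley.extractors import XPathExtractor
from models import *


class PypiScraper(BaseScraper):

    # Only pages whose URL matches one of these patterns reach scrape().
    # A single % acts as the wildcard in a generated (non-template) file.
    matching_urls = ["%pypi.python.org%"]

    def scrape(self, response):
        # response.html is the parsed document; xpath() returns a node list.
        data = response.html.xpath("//html/body/div/h1")
        if data:
            # Instantiating a model stores the extracted text, mirroring the
            # ModelClass(text=...) pattern in the template. PackageData is a
            # hypothetical model assumed to live in models.py.
            PackageData(text=data[0].text)


class PypiCrawler(BaseCrawler):

    # Crawling starts here and follows links up to max_depth.
    start_urls = ["http://pypi.python.org/pypi"]

    # Scrapers that get a chance at each fetched page.
    scrapers = [PypiScraper]

    # One level deep keeps the example crawl small.
    max_depth = 1

    # XPathExtractor matches the xpath() calls used in scrape().
    extractor = XPathExtractor

In this sketch, crawling starts at the PyPI index, follows links one level deep, and every fetched page whose URL matches %pypi.python.org% is handed to PypiScraper.scrape().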