from eventlet import GreenPool
from crawley.multiprogramming.pool import ThreadPool

from crawley import config
from crawley.http.managers import RequestManager
from crawley.http.urls import UrlFinder
from crawley.extractors import XPathExtractor
from crawley.exceptions import AuthenticationError
from crawley.utils import url_matcher

# Module-level list of crawler classes. CrawlerMeta's documentation describes
# a list of the user's crawlers used by the CLI commands -- presumably this one.
user_crawlers = []

class CrawlerMeta(type):
    """
        This metaclass adds the user's crawlers to a list
        used by the CLI commands.
        Abstract base crawlers won't be added.
    """

    def __init__(cls, name, bases, dct):
        """
            Registers the freshly created class in [user_crawlers] unless
            its module name starts with config.CRAWLEY_ROOT_DIR.

            params:

                name: the name of the class being created
                bases: its base classes
                dct: its namespace dictionary
        """

        # Classes whose module name starts with config.CRAWLEY_ROOT_DIR are
        # skipped; the class docstring calls them abstract base crawlers.
        if not hasattr(cls, '__module__') or not cls.__module__.startswith(config.CRAWLEY_ROOT_DIR):
            user_crawlers.append(cls)

        super(CrawlerMeta, cls).__init__(name, bases, dct)

# Concurrency back-ends selectable by name. Each entry pairs the pool class
# with the maximum concurrency configured for it in config.
Pools = {
    'greenlets': {
        'pool': GreenPool,
        'max_concurrency': config.MAX_GREEN_POOL_SIZE,
    },
    'threads': {
        'pool': ThreadPool,
        'max_concurrency': config.MAX_THREAD_POOL_SIZE,
    },
}

class BaseCrawler(object):
    """
        User's Crawlers must inherit from this class, may
        override some methods and define the start_urls list,
        the scrapers and the max crawling depth.
    """

    __metaclass__ = CrawlerMeta

    start_urls = []
    """ A list containing the start urls for the crawler """

    allowed_urls = []
    """ A list of urls allowed for crawl """

    black_list = []
    """ A list of blocked urls which will never be crawled """

    scrapers = []
    """ A list of scrapers classes """

    max_depth = -1
    """ The maximum crawling recursive level (-1 means no limit) """

    max_concurrency_level = None
    """ The maximum coroutines concurrency level """

    headers = {}
    """ The default request headers """

    requests_delay = config.REQUEST_DELAY
    """ The average delay time between requests """

    requests_deviation = config.REQUEST_DEVIATION
    """ The requests deviation time """

    extractor = None
    """ The extractor class. Default is XPathExtractor """

    post_urls = []
    """
        The Post data for the urls. A List of tuples containing (url, data_dict)
        Example: ("http://www.example.com/search", {'page' : '1', 'color' : 'blue'})
    """

    login = None
    """
        The login data. A tuple of (url, login_dict).
        Example: ("http://www.example.com/login", {'user' : 'myuser', 'pass' : 'mypassword'})
    """

    search_all_urls = True
    """
        If user doesn't define the get_urls method in scrapers then the crawler will search for urls
        in the current page itself depending on the [search_all_urls] attribute.
    """

    search_hidden_urls = False
    """
        Search for hidden urls in the whole html
    """

    def __init__(self, sessions=None, settings=None):
        """
            Initializes the crawler

            params:

                sessions: Database or Documents persistent sessions

                settings: the settings object. SHOW_DEBUG_INFO (default True)
                          and POOL (default 'greenlets') are read from it, and
                          it is handed to the request manager and the scrapers.
        """

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = getattr(settings, 'SHOW_DEBUG_INFO', True)
        self.settings = settings

        if self.extractor is None:
            self.extractor = XPathExtractor

        # The class attribute holds a class; from here on the instance holds an object
        self.extractor = self.extractor()

        pool_type = getattr(settings, 'POOL', 'greenlets')
        pool = Pools[pool_type]

        if self.max_concurrency_level is None:
            self.max_concurrency_level = pool['max_concurrency']

        self.pool = pool['pool'](self.max_concurrency_level)
        self.request_manager = RequestManager(settings=settings, headers=self.headers, delay=self.requests_delay, deviation=self.requests_deviation)

        # [scrapers] is declared as a list of classes, but _manage_scrapers
        # calls try_scrape on its items, so they are instantiated up front.
        self._initialize_scrapers()

    def _initialize_scrapers(self):
        """
            Instantiates all the scraper classes
        """

        self.scrapers = [scraper_class(settings=self.settings) for scraper_class in self.scrapers]

    def _make_request(self, url, data=None):
        """
            Returns the response object from a request

            params:
                data: if this param is present it makes a POST.
        """
        return self.request_manager.make_request(url, data, self.extractor)

    def _get_response(self, url, data=None):
        """
            Returns the response data from a request

            params:
                data: if this param is present it makes a POST.

            If [url] matches a pattern in [post_urls], the post data
            registered for that pattern replaces [data]; when several
            patterns match, the last one wins.
        """

        for pattern, post_data in self.post_urls:
            if url_matcher(url, pattern):
                data = post_data

        return self._make_request(url, data)

    def request(self, url, data=None):
        """
            Public entry point for a single request; delegates to
            _get_response, so the [post_urls] overrides apply.
        """

        return self._get_response(url, data=data)

    def _manage_scrapers(self, response):
        """
            Checks if some scraper is suited for data extraction on the current url.
            If so, gets the extractor object and delegate the scraping task
            to the scraper Object

            Returns the list of urls collected from every scraper whose
            try_scrape call did not return None.
        """
        scraped_urls = []

        for scraper in self.scrapers:

            urls = scraper.try_scrape(response)

            if urls is not None:
                # NOTE(review): committing here assumes a non-None result means the
                # scraper stored data in the sessions -- _commit has no other caller.
                self._commit()
                scraped_urls.extend(urls)

        return scraped_urls

    def _commit(self):
        """
            Makes a Commit in all sessions
        """

        for session in self.sessions:
            session.commit()

    def _search_in_urls_list(self, urls_list, url, default=True):
        """
            Searches an url in a list of urls

            Returns [default] when the list is empty, otherwise whether
            [url] matches at least one pattern of the list.
        """

        if not urls_list:
            return default

        return any(url_matcher(url, pattern) for pattern in urls_list)

    def _validate_url(self, url):
        """
            Validates if the url is in the crawler's [allowed_urls] list and not in [black_list].
            An empty [allowed_urls] allows everything; an empty [black_list] blocks nothing.
        """

        return self._search_in_urls_list(self.allowed_urls, url) and not self._search_in_urls_list(self.black_list, url, default=False)

    def _fetch(self, url, depth_level=0):
        """
            Recursive url fetching.

            Params:
                depth_level: The recursion level of [url] (0 for the start urls)
                url: The url to start crawling
        """

        if not self._validate_url(url):
            return

        if self.debug:
            print("-" * 80)
            print("crawling -> %s" % url)

        try:
            response = self._get_response(url)
        except Exception as ex:
            # Best effort: report the failure and give up on this url only
            self.on_request_error(url, ex)
            return

        if self.debug:
            print("-" * 80)

        urls = self._manage_scrapers(response)

        # Fall back to the links found in the page when no scraper supplied urls
        if not urls and self.search_all_urls:
            urls = self.get_urls(response)

        # The page has been scraped; stop following links once the depth
        # limit is reached (-1 means unlimited depth).
        if self.max_depth != -1 and depth_level >= self.max_depth:
            return

        for new_url in urls:
            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)

    def _login(self):
        """
            If target pages are hidden behind a login then
            pass through it first.

            self.login can be None or a tuple containing
            (login_url, params_dict)

            Raises AuthenticationError when the login request returns None.
        """
        if self.login is None:
            return

        url, data = self.login
        if self._get_response(url, data) is None:
            raise AuthenticationError("Can't login")

    def start(self):
        """
            Crawler's run method

            Fires on_start, logs in if [login] is defined, schedules every
            start url on the pool, waits for the pool to finish its work
            and finally fires on_finish.
        """
        self.on_start()
        self._login()

        for url in self.start_urls:
            self.pool.spawn_n(self._fetch, url, depth_level=0)

        # NOTE(review): eventlet's GreenPool provides waitall(); confirm that
        # crawley's ThreadPool exposes the same method.
        self.pool.waitall()
        self.on_finish()

    def get_urls(self, response):
        """
            Returns a list of urls found in the current html page
        """
        finder = UrlFinder(response, self.search_hidden_urls)
        return finder.get_urls()

    #Events section

    def on_start(self):
        """
            Override this method to do some work when the crawler starts.
        """

    def on_finish(self):
        """
            Override this method to do some work when the crawler finishes.
        """

    def on_request_error(self, url, ex):
        """
            Override this method to customize the request error handler.
            The default handler prints the error when debug is enabled.
        """

        if self.debug:
            print("Request to %s returned error: %s" % (url, ex))