crawley-project/crawley

View on GitHub
crawley/web_browser/browser.py

Summary

Maintainability
C
1 day
Test Coverage
import multiprocessing

from lxml import etree
from PyQt4 import QtCore, QtWebKit, QtGui
from baseBrowser import BaseBrowser, BaseBrowserTab, FrmBaseConfig, FrmBaseSettings
from config import DEFAULTS, SELECTED_CLASS

from crawley.crawlers.offline import OffLineCrawler
from crawley.manager.utils import get_full_template_path
from crawley.exceptions import InvalidProjectError
from crawley.extractors import XPathExtractor
from crawley.persistance.relational.connectors import connectors
from gui_project import GUIProject


class Browser(BaseBrowser):
    """
        The main browser window.

        Holds the tabbed pages and the url text box and implements
        the handlers declared by the base class.
    """

    def __init__(self, default_url=None):
        """
            Builds the window and opens a first tab.

            default_url: url loaded when the url text box is empty,
            DEFAULTS['url'] is used when it is not given.
        """

        if default_url is None:
            default_url = DEFAULTS['url']

        self.default_url = default_url
        BaseBrowser.__init__(self)
        self.add_tab()

    @staticmethod
    def _has_scheme(url):
        """
            Return True if [url] starts with a scheme followed by "://"

            A scheme is a letter followed by letters, digits,
            "+", "-" or "." (RFC 3986).
        """

        scheme, separator, _rest = url.partition("://")

        if not separator or not scheme[:1].isalpha():
            return False

        return all(char.isalnum() or char in "+-." for char in scheme)

    def current_tab(self):
        """
            Return the current tab
        """

        return self.ui.tab_pages.currentWidget()

    def browse(self):
        """
            Loads the url of the url text box in the current tab.

            An empty text box falls back to the default url. The
            default protocol is prepended only when the url does not
            already start with a scheme. Checking for the protocol
            anywhere in the text would leave urls like "httpbin.org"
            without a scheme and would prefix "ftp://host" twice.
        """

        # load_url() applies str() to the url anyway, doing it here lets
        # the scheme check rely on plain string methods
        url = str(self.ui.tb_url.text() or self.default_url)

        if not self._has_scheme(url):
            url = "%s://%s" % (DEFAULTS['protocol'], url)

        tab = self.current_tab()
        self.ui.tb_url.setText(url)
        tab.load_url(url)

    def add_tab(self):
        """
            Add a new tab to the browser, select it and browse
            with the current content of the url text box
        """

        index = self.ui.tab_pages.addTab(BrowserTab(self.ui), "New Tab")
        self.ui.tab_pages.setCurrentIndex(index)
        self.ui.tb_url.setFocus()
        self.browse()

    def tab_closed(self, index):
        """
            Triggered when the user closes a tab

            Schedules the tab widget for deletion and closes the
            window when no more than one tab is counted.
        """

        self.ui.tab_pages.widget(index).deleteLater()

        # NOTE(review): presumably the count still includes the tab that
        # is being deleted, so this closes the window with the last tab
        if self.ui.tab_pages.count() <= 1:
            self.close()

    def tab_changed(self, index):
        """
            Triggered when the current tab changes

            Shows the url of the new current tab in the url text box.
        """

        tab = self.current_tab()
        if tab is not None and tab.url is not None:
            self.ui.tb_url.setText(tab.url)

    def show(self):
        """
            Show the main window
        """

        BaseBrowser.show(self)


class BrowserTab(BaseBrowserTab):
    """
        One page of the browser.

        Implements the handlers declared by the base class: page
        loading, navigation and the crawley project actions.
    """

    def __init__(self, parent):
        """
            parent: object exposing the shared widgets (tab_pages,
            tb_url and the bt_* buttons); presumably stored as
            self.parent by the base class
        """

        BaseBrowserTab.__init__(self, parent)
        self.url = None
        self.crawler = OffLineCrawler()

    def load_bar(self, value):
        """
            Moves the progress bar to [value]
        """

        self.pg_load.setValue(value)

    def loaded_bar(self, state):
        """
            Triggered when the page finishes loading

            Hides the progress bar and refreshes the title and the
            icon of this tab. [state] is not used.
        """

        self.pg_load.hide()

        tabs = self.parent.tab_pages
        position = tabs.indexOf(self)

        tabs.setTabText(position, self.html.title())
        icon = QtWebKit.QWebSettings.iconForUrl(QtCore.QUrl(self.url))
        tabs.setTabIcon(position, icon)

    def load_start(self):
        """
            Triggered when a page starts loading, shows the progress bar
        """

        self.pg_load.show()

    def load_url(self, url, selected_nodes=None):
        """
            Fetches [url] with the crawler, wraps the response in the
            html template and displays it in the web view

            selected_nodes: optional xpaths of the nodes to highlight
        """

        self.url = str(url)
        response = self.crawler._get_response(self.url)

        with open(get_full_template_path("html_template"), "r") as template_file:
            template = template_file.read()

        page = template % {'content': response, 'css_class': SELECTED_CLASS}

        if selected_nodes is not None:
            page = self._highlight_nodes(page, selected_nodes)

        self.html.setHtml(page)
        self.html.show()

    def _highlight_nodes(self, html, nodes):
        """
            Marks the nodes selected by the user in [html]

            For every xpath in [nodes] the first matching tag gets
            SELECTED_CLASS added to its classes and the xpath as its
            id. Returns the resulting html serialized again.
        """

        html_tree = XPathExtractor().get_object(html)

        for xpath in nodes:

            matches = html_tree.xpath(xpath)

            if not matches:
                continue

            # only the first match is highlighted
            node = matches[0]
            current_classes = node.attrib.get("class", "")
            node.attrib["class"] = ("%s %s" % (current_classes, SELECTED_CLASS)).strip()
            node.attrib["id"] = xpath

        return etree.tostring(html_tree.getroot(), pretty_print=True, method="html")

    def url_changed(self, url):
        """
            Triggered when the url of the page changes

            Refreshes the url text box (only for the current tab) and
            stores the new url.
        """

        # NOTE(review): the text box receives the url stored before this
        # call, the new one is saved afterwards - confirm this is intended
        if self.is_current():
            self.parent.tb_url.setText(self.url)
        self.url = url.toString()

    def back(self):
        """
            Go to the previous page
        """

        if not self.is_current():
            return

        self.html.back()

    def ahead(self):
        """
            Go to the next page
        """

        if not self.is_current():
            return

        self.html.forward()

    def reload(self):
        """
            Reload the page
        """

        if not self.is_current():
            return

        self.html.reload()

    def start(self):
        """
            Starts a new project
        """

        self._start(is_new=True)

    def open(self):
        """
            Opens an existing project
        """

        self._start()

    def _start(self, is_new=False):
        """
            Asks for a location and starts (is_new=True) or opens
            (is_new=False) a project there

            Nothing happens when the dialog returns an empty name.
            An invalid project is printed and the project buttons
            are disabled.
        """

        if is_new:
            chosen = QtGui.QFileDialog.getSaveFileName(self, 'Start Project')
        else:
            chosen = QtGui.QFileDialog.getExistingDirectory(self, 'Open Project')

        dir_name = str(chosen)

        if not dir_name:
            return

        try:
            self.current_project = GUIProject(dir_name)
            self.current_project.set_up(self, is_new)

            self._disable_enable_project_buttons(True)

            # a new project is configured right away
            if is_new:
                self.configure()

        except InvalidProjectError as e:

            print("%s" % e)

            self._disable_enable_project_buttons(False)

    def configure(self):
        """
            Shows the configuration dialog of the current project
        """

        FrmConfig(self, self.current_project).show()

    def settings(self):
        """
            Shows the settings dialog of the current project
        """

        FrmSettings(self, self.current_project.settings).show()

    def save(self):
        """
            Saves a crawley project, same as generate()
        """

        self.generate()

    def generate(self):
        """
            Generates a DSL template from the url text box and the
            html of the page, only when this is the current tab
        """

        if not self.is_current():
            return

        url = self.parent.tb_url.text()
        content = unicode(self.html.page().mainFrame().toHtml())

        self.current_project.generate_template(url, content)

    def _run(self):
        """
            Generates the template and runs the current project.

            Used as the target of the process started by run().
        """

        self.generate()
        self.current_project.run()

        # NOTE(review): this executes in the process created by run(),
        # confirm the buttons of the GUI process are really re-enabled
        self._disable_enable_buttons(True)

    def run(self):
        """
            Runs the current project in another process

            The project buttons are disabled meanwhile and the run
            button is turned into a stop button.
        """

        self._disable_enable_buttons(False, also_run=False)
        self._change_run_handler(self.run, self.stop, "Stop Crawler")

        worker = multiprocessing.Process(target=self._run)
        self.process = worker
        worker.start()

    def _change_run_handler(self, curr_handler, new_handler, label):
        """
            Replaces the click handler of the run button
            and sets [label] as its text
        """

        run_button = self.parent.bt_run
        clicked = QtCore.SIGNAL("clicked()")

        self.disconnect(run_button, clicked, curr_handler)
        self.connect(run_button, clicked, new_handler)

        run_button.setText(label)

    def stop(self):
        """
            Terminates the running crawler process and restores
            the run button and the project buttons
        """

        self.process.terminate()
        self._change_run_handler(self.stop, self.run, "Run Crawler")
        self._disable_enable_buttons(True)

    def is_current(self):
        """
            Return True if this is the current active tab
        """

        return self.parent.tab_pages.currentWidget() is self

    def _disable_enable_buttons(self, enable, also_run=True):
        """
            Enables or disables the crawley related buttons

            enable: boolean
            also_run: when False the run button is left untouched
        """

        names = ["bt_configure", "bt_start", "bt_open", "bt_save", "bt_settings"]

        if also_run:
            names.append("bt_run")

        for name in names:
            getattr(self.parent, name).setEnabled(enable)

    def _disable_enable_project_buttons(self, enable):
        """
            Enables or disables the buttons that need an open project

            enable: boolean
        """

        for name in ("bt_configure", "bt_run", "bt_settings", "bt_save"):
            getattr(self.parent, name).setEnabled(enable)


class FrmConfig(FrmBaseConfig):
    """
        Dialog to edit the config.ini file of a crawley project.
    """

    # Label of the last entry of the max depth combo box
    INFINITE = "Infinite"
    # Amount of numeric entries (0 .. MAX_DEPTH_OPTIONS - 1) in the combo box
    MAX_DEPTH_OPTIONS = 100

    def __init__(self, parent, current_project):
        """
            Fills the dialog with the configuration of [current_project]
        """

        FrmBaseConfig.__init__(self, parent)
        self.current_project = current_project
        self.config = current_project.get_configuration()

        ui = self.config_ui
        ui.tb_start_url.setText(self.config[("crawler", "start_urls")])

        # numeric depths first, the infinite entry goes last
        depth_labels = [str(depth) for depth in range(self.MAX_DEPTH_OPTIONS)]
        depth_labels.append(self.INFINITE)
        ui.cb_max_depth.addItems(depth_labels)

        # a stored depth of -1 selects the infinite entry
        stored_depth = int(self.config[("crawler", "max_depth")])
        selected = self._check_infinite(stored_depth, infinite_value=-1, get_index=True)
        ui.cb_max_depth.setCurrentIndex(selected)

    def _check_infinite(self, max_depth, infinite_value=INFINITE, get_index=False):
        """
            Translates the infinite marker, any other value is
            returned untouched

            max_depth: value to check
            infinite_value: value that stands for infinite
            get_index: return the combo box index of the infinite
            entry instead of -1
        """

        is_infinite = max_depth == infinite_value

        if not is_infinite:
            return max_depth

        return self.MAX_DEPTH_OPTIONS if get_index else -1

    def ok(self):
        """
            Stores the values of the dialog in the configuration,
            saves it and closes the dialog
        """

        ui = self.config_ui

        depth = self._check_infinite(ui.cb_max_depth.currentText())
        self.config[("crawler", "max_depth")] = depth
        self.config[("crawler", "start_urls")] = ui.tb_start_url.text()

        self.config.save()
        self.close()

    def cancel(self):
        """
            Closes the dialog without saving
        """

        self.close()


class FrmSettings(FrmBaseSettings):
    """
        A GUI on the top of the settings.py files of crawley projects.
    """

    # Maps the name of each control to the settings attribute it edits.
    # "tb_" controls hold text, "ck_" controls hold a boolean.
    attrs_controls = { 'tb_name' : "DATABASE_NAME",
                       'tb_user' : "DATABASE_USER",
                       'tb_password' : "DATABASE_PASSWORD",
                       'tb_host' : "DATABASE_HOST",
                       'tb_port' : "DATABASE_PORT",
                       'tb_json' : "JSON_DOCUMENT",
                       'tb_xml' : "XML_DOCUMENT",
                       'ck_show_debug' : "SHOW_DEBUG_INFO",
                     }

    def __init__(self, parent, settings):
        """
            Setups the frm settings window

            settings: settings module of the project, its attributes
            fill the controls of the dialog
        """

        FrmBaseSettings.__init__(self, parent)
        self.settings = settings

        for control_name, attribute_name in self.attrs_controls.iteritems():

            control = getattr(self.settings_ui, control_name)
            value = self._check_for_attribute(attribute_name)

            if control_name.startswith("tb_"):
                control.setText(value)

            elif control_name.startswith("ck_"):
                # a missing attribute yields '', hand over a real boolean
                control.setChecked(bool(value))

        engine = self._check_for_attribute("DATABASE_ENGINE")
        connectors_names = list(connectors.keys())

        # Fall back to the first entry when the configured engine is unknown
        # or missing, instead of failing on an unassigned index
        if engine in connectors_names:
            index = connectors_names.index(engine)
        else:
            index = 0

        self.settings_ui.cb_engine.addItems(connectors_names)
        self.settings_ui.cb_engine.setCurrentIndex(index)

    def _check_for_attribute(self, attr_name):
        """
            Return the settings attribute [attr_name] or '' if
            the settings do not define it
        """

        return getattr(self.settings, attr_name, '')

    def ok(self):
        """
            Saves the settings.py file

            Collects the values of the controls, updates the settings
            module in memory, writes them to its file and closes
            the dialog.
        """

        settings_dict = {}

        for control_name, attribute_name in self.attrs_controls.iteritems():

            control = getattr(self.settings_ui, control_name)

            if control_name.startswith("tb_"):
                settings_dict[attribute_name] = str(control.text())

            elif control_name.startswith("ck_"):
                settings_dict[attribute_name] = control.isChecked()

        settings_dict["DATABASE_ENGINE"] = str(self.settings_ui.cb_engine.currentText())

        self.settings.__dict__.update(settings_dict)
        self._dump_file(settings_dict)
        self.close()

    @staticmethod
    def _format_value(value):
        """
            Return [value] as the python source written to settings.py

            Strings already typed between a pair of quotes are kept,
            any other string is written with repr() so that quotes
            and backslashes inside it are escaped. Non string values
            are written as they print.
        """

        if not isinstance(value, basestring):
            return "%s" % (value,)

        already_quoted = (len(value) >= 2 and
                          value[0] in "'\"" and
                          value[0] == value[-1] and
                          value[0] not in value[1:-1])

        if already_quoted:
            return value

        return repr(value)

    def _dump_file(self, settings_dict):
        """
            Writes the settings_dict to a settings.py file

            Only the "KEY = value" lines whose key is in [settings_dict]
            are rewritten, every other line is kept as it is. Keys of
            [settings_dict] that are not in the file are not added.
        """
        SEPARATOR = " = "

        with open(self.settings.__file__, 'r') as f:
            lines = f.readlines()

        new_lines = []

        for line in lines:

            # split on the first separator only, the value may contain it too
            key, separator, _old_value = line.partition(SEPARATOR)

            if not separator or key.strip() not in settings_dict:
                new_lines.append(line)
                continue

            new_value = self._format_value(settings_dict[key.strip()])

            # rstrip() keeps the indentation of the original line
            new_lines.append("%s%s%s" % (key.rstrip(), SEPARATOR, new_value))

        # rewritten lines (and a last line without line break) get their ending here
        stream = "".join(line if "\n" in line else "%s \n" % line
                         for line in new_lines)

        with open(self.settings.__file__, 'w') as f:
            f.write(stream)

    def cancel(self):
        """
            Closes the dialog
        """
        self.close()