
# View on GitHub


# 1 hr
# Test Coverage
"""
Basic Class to provide similar interfaces
and basic methods to all child classes
"""
import json
import urllib.parse as up
import logging
import requests
from bs4 import BeautifulSoup

class PyLeiheWeb:
    """
    Basic class to provide similar interfaces
    and basic methods to all child classes.

    It bundles three groups of helpers:

    * JSON (de)serialisation (`reprJSON`, `toJSON`, `toJSONFile`,
      `_loadJSONFile`, `loadFromJSON`),
    * HTML form/node lookup (`searchNodeMultipleContain`, `getPostFormURL`),
    * HTTP access through a shared session (`simpleGET`, `simpleSession`).
    """
    # Host name used by `getURL`; child classes are expected to override it.
    DOMAIN = ""
    # URL scheme used by `getURL`.
    # NOTE(review): `getURL` reads `cls.SCHEME`, but no definition was visible
    # in the reviewed source; "https" is an assumed default - confirm.
    SCHEME = "https"

    # Session object used for all requests. It may be preset on class level;
    # `__init__` only creates a new one if none is available.
    Session = None

    def __init__(self, sess=None):
        """
        Bind the session that is used for all requests of this instance.

        Arguments:
            sess: _optional_ session object to use.

                * *if None* the class level `Session` is kept; a new
                  `requests.Session` is created only if that is `None` too
                * *if True* a new `requests.Session` is always created
        """
        if sess is not None:
            self.Session = sess
        if self.Session is None or sess is True:
            self.Session = requests.Session()

    @classmethod
    def reprJSON(cls):
        """
        Creates a JSON compatible representation.

        .. HINT::
            Should be implemented individually for each class.

        Returns:
            json compatible representation (dict or list)
        """
        return {
            "cls": cls.__name__
        }

    def toJSON(self):
        """
        Converts the current object instance to json.

        Calls the `reprJSON` method from the instance
        to get a json compatible representation

        Returns:
            str: the json document
        """
        return json.dumps(self.reprJSON())

    def toJSONFile(self, filename=""):
        """
        Saves the json representation as a file

        Uses `reprJSON()` to obtain the data. The suffix `.json`
        is always appended to `filename`.

        Arguments:
            filename (str): path to the file to write (without `.json`),
                *if empty* the class name of the instance is used
        """
        if filename == "":
            filename = self.__class__.__name__
        with open('{}.json'.format(filename), 'w', encoding="utf-8") as f:
            json.dump(self.reprJSON(), f, sort_keys=False, indent=4)

    @classmethod
    def _loadJSONFile(cls, filename=""):
        """
        Private method to load a JSON file and
        automatically convert it to python types.

        Arguments:
            filename (str): path to the file to read (without `.json`),
                *if empty* the class name is used

        Returns:
            the json data as python types (dict or list)
        """
        if filename == "":
            filename = cls.__name__
        with open('{}.json'.format(filename), 'r', encoding="utf-8") as f:
            return json.load(f)

    @classmethod
    def loadFromJSON(cls, data=None, filename=""):
        """
        Converts a typical json representation consisting of lists/dicts into
        an instance.

        For contained instances of other classes,
        their respective conversion functions are called.

        If no data is passed, the data is loaded by calling `_loadJSONFile`.

        Arguments:
            data (dict, list): _optional_ parsed json as dict with json
                compatible python objects
                *if None* `_loadJSONFile` is called and data is loaded from disk
            filename (str): _optional_ the path to the json file containing
                the data

        Returns:
            new instance

        Raises:
            NotImplementedError: always in this base class; child classes
                have to provide the implementation
        """
        raise NotImplementedError()

    @classmethod
    def searchNodeMultipleContain(cls, content, Node, NodeAttr, ContNode=None, ContNodeData=None):
        """
        Searches an html text `content` for the first occurrence of a `Node`
        with the properties `NodeAttr`.
        As an additional condition it can be required
        that a certain node `ContNode` must be contained in the found `Node`
        with certain properties `ContNodeData`.

        Arguments:
            content (str):  html content
            Node (str): name of the node
            NodeAttr (dict[str: str]): with the attributes of the nodes
            ContNode (str):  optional additional node which must be inside
                of `Node`
            ContNodeData (dict[str: str]): with the attributes of the `ContNode`

        Returns:
            First node that meets the conditions or
            `None` if there is no such node
        """
        ContNodeData = ContNodeData or {}
        soup = BeautifulSoup(content, features="html.parser")
        candidates = soup.find_all(Node, attrs=NodeAttr)
        if not candidates:
            return None
        # without any containment condition the first match wins
        if not ContNode and not ContNodeData:
            return candidates[0]
        return next(
            (node for node in candidates if node.find(ContNode, ContNodeData)),
            None)

    @classmethod
    def getPostFormURL(cls, content, ContNode="", curr_url=None, ContNodeData=None):
        """
        Searches an html text `content` for the destination address
        of the first html post form.
        As an additional condition it can be required that the form should
        contain a specific `ContNode`.

        Arguments:
            content (str): html content
            ContNode (str): optional node which must be inside of the form
            curr_url: _optional_ address of the form,
                if available this is combined with the target address
            ContNodeData (dict[str: str]): with the attributes of the `ContNode`

        Returns:
            str: with the destination url of the form or
            `None` if no matching form was found
        """
        form = cls.searchNodeMultipleContain(content,
                                             Node="form",
                                             NodeAttr={"method": "post"},
                                             ContNode=ContNode,
                                             ContNodeData=ContNodeData)
        if form is None:
            return None
        form_action = form.get('action')
        if curr_url is None:
            return form_action
        return up.urljoin(curr_url, form_action)

    @classmethod
    def getURL(cls, to):
        """
        Build a URL from the scheme, domain and target path on the server
        and return it.

        Arguments:
            to (str or list[str]): path of a destination address,
                a list is joined with `/`

        Returns:
            `str` with the compound url
        """
        if not isinstance(to, str):
            to = "/".join(to)
        return cls.SCHEME + "://" + cls.DOMAIN + "/" + to

    def simpleGET(self, url, **kwargs):
        """
        Simple function to load one URL with GET.
        For detailed information, see `simpleSession()`
        """
        return self.simpleSession(url=url, method="get", **kwargs)

    def _get_title(self):
        """
        Return the `title` attribute of the instance for log messages,
        or an empty string if the instance has no such attribute.
        """
        return getattr(self, "title", "")

    def simpleSession(self, url, method="POST", retry=1, **kwargs):
        """
        Simple function to load one URL with GET or POST.

        Arguments:
            url (str or up.ParseResult): with the destination address
            method (str): http method to access the url (case insensitive),
                          currently supported: `GET` and `POST`
            retry (int): how often the request is repeated if the remote end
                closed the connection without a response;
                a negative value returns `None` without any request
            **kwargs: additional configuration, passed on unchanged to
                `requests.Session.request`

        Returns:
            * `None` if the data could not be loaded
              (a `requests.ConnectionError` occurred)
            * else the `requests.Response` returned by the session

        Raises:
            Only `requests.ConnectionError` is caught here; every other
            exception raised by the session propagates to the caller.
            The following connection errors are logged as warnings:

                - Remote end closed connection without response
                - [Errno 11004] getaddrinfo failed
                - [Errno -2] Name or service not known
                - [Errno 8] nodename nor servname

            NOTE(review): the earlier documentation referred to
            `requests.Response.raise_for_status`, but no such call is
            visible in this method - confirm the intended behaviour.
        """
        # prevent recursive crash
        if retry < 0:
            return None
        method = method.upper()
        # unparse url if necessary
        if isinstance(url, up.ParseResult):
            url = up.urlunparse(url)
        mp = None
        # try requests and capture ConnectionError's
        try:
            mp = self.Session.request(method, url, **kwargs)
        except requests.ConnectionError as exc:
            message = str(exc)
            # reset mp return value
            mp = None
            if 'Remote end closed connection without response' in message:
                logging.warning("[%s] Remote end closed connection: %s", self._get_title(), url)
                if retry > 0:
                    # NOTE(review): the name of this logging call was lost in
                    # the reviewed source; `logging.info` is assumed.
                    logging.info("Try it again (retry %i)", retry)
                    mp = self.simpleSession(url, method=method, retry=retry - 1, **kwargs)
            elif ("[Errno 11004] getaddrinfo failed" in message
                  or "[Errno -2] Name or service not known" in message
                  or "[Errno 8] nodename nor servname " in message):
                logging.warning("[%s] Hostname can't be resolved: %s",
                                str(self), url, exc_info=False)
        return mp