# extras/xml_tools.py from inasafe/inasafe
#
# extras/xml_tools.py
# Summary
#
# Maintainability
#
# 0 mins
# Test Coverage
#
# Issues
"""Basic XML utilities based on minidom - the built in Document Object Model
"""

import sys
from xml.dom import minidom, Node
from safe.common.utilities import verify

def print_tree(n, indent=0):
    """Print a DOM node, its following siblings and all their descendants.

    One line is written per node showing its name, type and stripped value.
    Children are printed with the indentation increased by four columns.

    Input
        n: DOM node to start from (a false value prints nothing)
        indent: Number of leading spaces for the current level
    """

    while n:
        # Build the whole line first and print it as a single argument so
        # the statement is valid, with identical output, on both Python 2
        # and Python 3. Joining with single spaces reproduces the item
        # separators of the former multi-item print statement.
        fields = [' ' * indent,
                  'Node name: "%s",' % n.nodeName,
                  'Node type: "%s",' % n.nodeType,
                  'Node value: "%s"' % str(n.nodeValue).strip()]
        print(' '.join(fields))

        # Depth first: descend into the children, then move to the sibling
        print_tree(n.firstChild, indent + 4)
        n = n.nextSibling


def pretty_print_tree(n, indent=0):
    """Print the default string representation of n.

    Input
        n: Object to print
        indent: Accepted for symmetry with print_tree; currently unused
    """

    # Single-argument call form is valid on both Python 2 and Python 3
    print(n)

def parse(fid):
    """Return the minidom DOM object for an open XML file descriptor.

    The descriptor is rewound to its beginning before parsing, so it
    must support seek().
    """

    # FIXME (OLE): XML code should be validated against the DTD

    fid.seek(0)
    return minidom.parse(fid)


def get_elements(nodelist):
    """Filter nodelist down to its ELEMENT_NODE members.

    The relative order of the nodes is kept and a new list is returned.
    """

    return [candidate for candidate in nodelist
            if candidate.nodeType == Node.ELEMENT_NODE]


def get_text(nodelist):
    """Join the values of all TEXT_NODE members of nodelist with ', '.

    Nodes of any other type are ignored. An empty string is returned
    when nodelist holds no text nodes.
    """

    fragments = [item.nodeValue for item in nodelist
                 if item.nodeType == Node.TEXT_NODE]
    return ', '.join(fragments)



def remove_whitespace(s):
    """Remove excess whitespace including newlines from string

    Leading and trailing whitespace is dropped and every internal run of
    whitespace is collapsed to a single space.
    """

    # str.join replaces string.join, which no longer exists in Python 3
    return ' '.join(s.split())  # Split on whitespace



#----------------------------
# XML object model
#----------------------------

class XML_element(dict):
    """Element of a simple, ordered XML object model.

    Each element has a tag and a value. The value is either a text string
    (terminal element) or a list of child XML_element objects whose order
    is preserved.

    Although the class derives from dict, the underlying dict storage is
    never populated. Lookups go through the overridden __getitem__,
    __contains__, has_key and keys, which search the list of children.
    """

    # Type(s) recognised as a text value: basestring on Python 2 and
    # str on Python 3, where basestring no longer exists.
    try:
        _text_types = (basestring,)
    except NameError:
        _text_types = (str,)

    def __init__(self,
                 tag=None,
                 value=None,
                 version='1.0',
                 encoding='iso-8859-1'):
        """
        value can be either
          * An XML_element (it is wrapped in a one-element list)
          * a list of XML_element
          * a text string

        If tag is None the element becomes the root element: its tag is
        set to the XML declaration built from version and encoding.
        """

        if isinstance(value, XML_element):
            value = [value]

        self.value = value

        if tag is None:
            tag = '?xml version="%s" encoding="%s"?' % (version, encoding)
            self.root_element = True
        else:
            self.root_element = False

        self.tag = tag

        # FIXME: It might be better to represent these objects
        # in a proper dictionary format with
        # {tag: value, ...}
        # No, tried that - it removes any notion of ordering.

    def _is_text(self):
        """Return True if the value of this element is a text string."""

        return isinstance(self.value, self._text_types)

    def _children(self):
        """Return the child elements, or [] for text and None values.

        Guards the lookup methods against iterating over the characters
        of a text value or over None.
        """

        if self.value is None or self._is_text():
            return []
        return self.value

    def __add__(self, other):
        """Concatenate the string forms of self and other."""

        return str(self) + str(other)

    def __radd__(self, other):
        """Concatenate the string forms of other and self."""

        return str(other) + str(self)    # Python swaps self and other

    def __repr__(self):
        return str(self)

    def __str__(self, indent=0):
        """String representation of XML element

        Child elements are indented four columns relative to their
        parent, except directly below the root element. The root element
        is written without a closing tag.
        """

        if self.root_element is True:
            increment = 0
        else:
            increment = 4

        s = tab = ' ' * indent

        s += '<%s>' % self.tag
        if self._is_text():
            s += remove_whitespace(self.value)
        else:
            s += '\n'
            for e in self._children():
                s += e.__str__(indent + increment)
            s += tab

        if self.root_element is False:
            s += '</%s>\n' % self.tag

        return s

    def __getitem__(self, key):
        """Return sub-tree starting at element with tag equal to specified key
        If node is terminal, its text value will be returned instead of itself.
        This will allow expressions such as

        xmlobject['datafile']['accountable'] == 'Jane Sexton'

        If more than one element matches the given key a list of all
        matches will be returned. If nothing matches, or if this element
        has no child elements, None is returned.
        """

        result = []
        for node in self._children():
            if node.tag == key:
                if isinstance(node.value, self._text_types):
                    result.append(str(node.value))
                else:
                    result.append(node)

        if len(result) == 0:
            return None
        if len(result) == 1:
            return result[0]
        return result

    def has_key(self, key):
        """Return True if any direct child has a tag equal to key."""

        return any(node.tag == key for node in self._children())

    def __contains__(self, key):
        """Support 'key in element' with the same answer as has_key."""

        return self.has_key(key)

    def keys(self):
        """Return the tags of the direct children, in document order."""

        return [str(node.tag) for node in self._children()]

    def pretty_print(self, indent=0):
        """Print the document without tags using indentation

        Despite the name nothing is written to stdout: the formatted
        text is returned as a string.
        """

        s = tab = ' ' * indent
        s += '%s: ' % self.tag
        if self._is_text():
            s += self.value
        else:
            s += '\n'
            for e in self._children():
                s += e.pretty_print(indent + 4)
        s += '\n'

        return s


def xml2object(xml, verbose=False):
    """Generate XML object model from XML file or XML text

    This is the inverse operation to the __str__ representation
    (up to whitespace).

    Input xml can be either an
    * xml file (name of the file as a string)
    * open xml file object

    The verbose argument is currently unused.

    Return the XML_element produced by dom2object for the parsed document.

    Raise Exception if the file cannot be parsed or if the parsed
    document cannot be converted.
    """

    # FIXME - can we allow xml to be string?
    # This would depend on minidom's parse function

    # basestring exists on Python 2 only; fall back to str on Python 3
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Input tests
    opened_here = isinstance(xml, string_types)
    if opened_here:
        # Binary mode hands the raw bytes to the parser, so that decoding
        # is governed by the encoding declared in the XML itself.
        fid = open(xml, 'rb')
    else:
        fid = xml

    # File-like objects such as StringIO have no name attribute
    source = getattr(fid, 'name', repr(fid))

    try:
        try:
            dom = parse(fid)
        except Exception as e:
            # Throw filename into dom exception
            msg = 'XML file "%s" could not be parsed.\n' % source
            msg += 'Error message from parser: "%s"' % str(e)
            raise Exception(msg)

        try:
            xml_object = dom2object(dom)
        except Exception as e:
            msg = 'Could not convert %s into XML object.\n' % source
            msg += str(e)
            raise Exception(msg)
    finally:
        # Only close what was opened here; a file object passed in by
        # the caller remains the caller's responsibility.
        if opened_here:
            fid.close()

    return xml_object



def dom2object(node):
    """Convert DOM representation to XML_object hierarchy.

    Input
        node: DOM node (document or element) to convert

    Return an XML_element whose value is
        * the stripped text, if node holds non-empty text
        * a list of converted children, if node holds child nodes
        * the empty string, if node holds neither

    A document node is converted with tag=None, which makes the result
    the root element.

    Raise Exception if non-empty text is followed by a non-text node.
    """

    value = []
    textnode_encountered = None
    for n in node.childNodes:

        if n.nodeType == Node.TEXT_NODE:
            # Child is a text element - omit the dom tag #text and
            # go straight to the text value.

            # Note - only the last text value will be recorded.
            # Children collected before the text are also replaced,
            # because value is rebound from the list to the text.

            msg = 'Text element has child nodes - this shouldn\'t happen'
            verify(len(n.childNodes) == 0, msg)

            x = n.nodeValue.strip()
            if len(x) == 0:
                # Skip empty text children
                continue

            textnode_encountered = value = x
        else:
            # XML element (every node type other than text ends up here)

            if textnode_encountered is not None:
                msg = 'A text node was followed by a non-text tag. This is not allowed.\n'
                msg += 'Offending text node: "%s" ' % str(textnode_encountered)
                msg += 'was followed by node named: "<%s>"' % str(n.nodeName)
                raise Exception(msg)

            value.append(dom2object(n))

    # Deal with empty elements
    if len(value) == 0:
        value = ''

    if node.nodeType == Node.DOCUMENT_NODE:
        # Root node (document)
        tag = None
    else:
        # Normal XML node
        tag = node.nodeName

    return XML_element(tag=tag, value=value)





    #=================== Useful print statement
    #if n.nodeType == 3 and str(n.nodeValue).strip() == '':
    #    pass
    #else:
    #    print 'Node name: "%s",' %n.nodeName,\
    #          'Node type: "%s",' %n.nodeType,\
    #          'Node value: "%s",' %str(n.nodeValue).strip(),\
    #          'Node children: %d' %len(n.childNodes)