bomquote/transistor

View on GitHub
transistor/persistence/exporters/xml.py

Summary

Maintainability
A
45 mins
Test Coverage
# -*- coding: utf-8 -*-
"""
transistor.persistence.exporters.xml
~~~~~~~~~~~~
This module implements classes that export (serialize) the data inside
a BaseWorker from a SplashScraper to XML.

Most of this module is heavily inspired or else copied from Scrapy. It has
been modified to fit Transistor's API in requiring a scraper and items
object. Also, Transistor only supports python 3. Otherwise, this module
generally follows Scrapy's API and uses Scrapy's documentation.

:copyright: Original scrapy.exporters from scrapy==1.5.1 is
Copyright by it's authors and further changes or contributions here are
Copyright (C) 2018 by BOM Quote Limited.
:license: Original scrapy.exporters from scrapy==1.5.1 license is found at
https://github.com/scrapy/scrapy/archive/1.5.1.zip
and further changes or contributions here are licensed under The MIT
License, see LICENSE for more details.
~~~~~~~~~~~~
"""

import sys
from xml.sax.saxutils import XMLGenerator
from transistor.persistence.item import Item
from transistor.utility.python import is_listlike
from .base import BaseItemExporter


__all__ = ['XmlItemExporter']

class XmlItemExporter(BaseItemExporter):
    """
    Exports Items in XML format to the specified file object.
    """
    def __init__(self, file, **kwargs):
        """

        :param file:  the file-like object to use for exporting the data.
        It's write method should accept bytes (a disk file opened in
        binary mode, a io.BytesIO object, etc)
        :param kwargs: root_element (str) – The name of root element in
        the exported XML.
        :param kwargs: item_element (str) – The name of each item element
        in the exported XML.

        A typical output of this exporter would be:
            <?xml version="1.0" encoding="utf-8"?>
            <items>
              <item>
                <name>Color TV</name>
                <price>1200</price>
             </item>
              <item>
                <name>DVD player</name>
                <price>200</price>
             </item>
            </items>

        Unless overridden in the serialize_field() method, multi-valued
        fields are exported by serializing each value inside a <value>
        element. This is for convenience, as multi-valued fields are
        very common.

        For example, the item:
        >>> Item(name=['John', 'Doe'], age='23')

        Would be serialized as:
            <?xml version="1.0" encoding="utf-8"?>
            <items>
              <item>
                <name>
                  <value>John</value>
                  <value>Doe</value>
                </name>
                <age>23</age>
              </item>
            </items>
        """
        super().__init__()
        self.item_element = kwargs.pop('item_element', 'item')
        self.root_element = kwargs.pop('root_element', 'items')
        self._configure(kwargs)
        if not self.encoding:
            self.encoding = 'utf-8'
        self.xg = XMLGenerator(file, encoding=self.encoding)

    def _beautify_newline(self, new_item=False):
        if self.indent is not None and (self.indent > 0 or new_item):
            self._xg_characters('\n')

    def _beautify_indent(self, depth=1):
        if self.indent:
            self._xg_characters(' ' * self.indent * depth)

    def start_exporting(self):
        self.xg.startDocument()
        self.xg.startElement(self.root_element, {})
        self._beautify_newline(new_item=True)

    def export_item(self, item):
        self._beautify_indent(depth=1)
        self.xg.startElement(self.item_element, {})
        self._beautify_newline()
        for name, value in self._get_serialized_fields(item, default_value=''):
            self._export_xml_field(name, value, depth=2)
        self._beautify_indent(depth=1)
        self.xg.endElement(self.item_element)
        self._beautify_newline(new_item=True)

    def finish_exporting(self):
        self.xg.endElement(self.root_element)
        self.xg.endDocument()

    def _export_xml_field(self, name, serialized_value, depth):
        self._beautify_indent(depth=depth)
        self.xg.startElement(name, {})
        if hasattr(serialized_value, 'items'):
            self._beautify_newline()
            for subname, value in serialized_value.items():
                self._export_xml_field(subname, value, depth=depth+1)
            self._beautify_indent(depth=depth)
        elif is_listlike(serialized_value):
            self._beautify_newline()
            for value in serialized_value:
                self._export_xml_field('value', value, depth=depth+1)
            self._beautify_indent(depth=depth)
        elif isinstance(serialized_value, str):
            self._xg_characters(serialized_value)
        else:
            self._xg_characters(str(serialized_value))
        self.xg.endElement(name)
        self._beautify_newline()

    # Workaround for https://bugs.python.org/issue17606
    if sys.version_info[:3] >= (2, 7, 4):
        def _xg_characters(self, serialized_value):
            if not isinstance(serialized_value, str):
                serialized_value = serialized_value.decode(self.encoding)
            return self.xg.characters(serialized_value)
    else:  # pragma: no cover
        def _xg_characters(self, serialized_value):
            return self.xg.characters(serialized_value)