# html_image_url_extractor/models/ir_fields_converter.py
# -*- coding: utf-8 -*-
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import re
import logging
from lxml import etree, html
from openerp import api, models
# Module-level logger, named after this module via ``__name__``.
_logger = logging.getLogger(__name__)
class IrFieldsConverter(models.Model):
    """Extend ``ir.fields.converter`` with an HTML image URL extractor."""
    _inherit = "ir.fields.converter"

    @api.model
    def imgs_from_html(self, html_content, limit=None, fail=False):
        """Extract all images in order from an HTML field in a generator.

        Image URLs are taken from the ``src`` attribute of ``<img>`` elements
        and from ``url(...)`` values found in ``background`` or
        ``background-image`` declarations of inline ``style`` attributes.

        :param str html_content:
            HTML contents from where to extract the images.

        :param int limit:
            Only get up to this number of images. A falsy value (``None`` or
            ``0``) means no limit.

        :param bool fail:
            If ``True``, exceptions will be raised. Otherwise parsing
            failures are logged and the generator ends without yielding.

        :return:
            Generator yielding the image URLs as strings, in document order.
        """
        # Parse HTML
        try:
            doc = html.fromstring(html_content)
        except (TypeError, etree.XMLSyntaxError, etree.ParserError):
            if fail:
                raise
            _logger.exception("Failure parsing this HTML:\n%s",
                              html_content)
            return

        # Required tools
        # The XPath selects <img src=...> elements plus any element whose
        # inline style mentions both "background" and "url(" in any case.
        query = """
            //img[@src] |
            //*[contains(translate(@style, "BACKGROUND", "background"),
                         'background')]
               [contains(translate(@style, "URL", "url"), 'url(')]
        """
        rgx = r"""
            url\(\s*            # Start function
            (?P<url>[^)]*)      # URL string
            \s*\)               # End function
        """
        rgx = re.compile(rgx, re.IGNORECASE | re.VERBOSE)
        background_properties = {"background", "background-image"}

        # Loop through possible image URLs. ``limit`` is enforced on the
        # number of URLs actually yielded, not on the number of matched
        # elements: an element may yield none (e.g. ``background: red``) or
        # several URLs.
        yielded = 0
        for element in doc.xpath(query):
            if element.tag == "img":
                urls = [element.attrib["src"]]
            else:
                urls = []
                # NOTE: splitting on ";" means URLs that themselves contain
                # a semicolon (e.g. ``data:image/png;base64,...``) are cut
                # in half and therefore not extracted.
                for rule in element.attrib["style"].split(";"):
                    # Extract background image
                    parts = rule.split(":", 1)
                    if len(parts) != 2:
                        # Malformed CSS: declaration without a value
                        continue
                    if parts[0].strip().lower() not in background_properties:
                        continue
                    match = rgx.search(parts[1])
                    if match is None:
                        # Background declaration without any URL
                        continue
                    # The greedy URL group may capture whitespace before the
                    # closing parenthesis; drop it first so that surrounding
                    # quotes can be removed on both sides.
                    urls.append(match.group("url").strip().strip("\"'"))

            for url in urls:
                if limit and yielded >= limit:
                    return
                yield url
                yielded += 1