|
@@ -0,0 +1,71 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
|
|
|
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
|
|
+
|
|
|
+import re
|
|
|
+import logging
|
|
|
+from lxml import etree, html
|
|
|
+from openerp import api, models
|
|
|
+
|
|
|
+_logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
+
|
|
|
+class IrFieldsConverter(models.Model):
|
|
|
+ _inherit = "ir.fields.converter"
|
|
|
+
|
|
|
+ @api.model
|
|
|
+ def imgs_from_html(self, html_content, limit=None, fail=False):
|
|
|
+ """Extract all images in order from an HTML field in a generator.
|
|
|
+
|
|
|
+ :param str html_content:
|
|
|
+ HTML contents from where to extract the images.
|
|
|
+
|
|
|
+ :param int limit:
|
|
|
+ Only get up to this number of images.
|
|
|
+
|
|
|
+ :param bool fail:
|
|
|
+ If ``True``, exceptions will be raised.
|
|
|
+ """
|
|
|
+ # Parse HTML
|
|
|
+ try:
|
|
|
+ doc = html.fromstring(html_content)
|
|
|
+ except (TypeError, etree.XMLSyntaxError, etree.ParserError):
|
|
|
+ if fail:
|
|
|
+ raise
|
|
|
+ else:
|
|
|
+ _logger.exception("Failure parsing this HTML:\n%s",
|
|
|
+ html_content)
|
|
|
+ return
|
|
|
+
|
|
|
+ # Required tools
|
|
|
+ query = """
|
|
|
+ //img[@src] |
|
|
|
+ //*[contains(translate(@style, "BACKGROUND", "background"),
|
|
|
+ 'background')]
|
|
|
+ [contains(translate(@style, "URL", "url"), 'url(')]
|
|
|
+ """
|
|
|
+ rgx = r"""
|
|
|
+ url\(\s* # Start function
|
|
|
+ (?P<url>[^)]*) # URL string
|
|
|
+ \s*\) # End function
|
|
|
+ """
|
|
|
+ rgx = re.compile(rgx, re.IGNORECASE | re.VERBOSE)
|
|
|
+
|
|
|
+ # Loop through possible image URLs
|
|
|
+ for lap, element in enumerate(doc.xpath(query)):
|
|
|
+ if limit and lap >= limit:
|
|
|
+ break
|
|
|
+ if element.tag == "img":
|
|
|
+ yield element.attrib["src"]
|
|
|
+ else:
|
|
|
+ for rule in element.attrib["style"].split(";"):
|
|
|
+ # Extract background image
|
|
|
+ parts = rule.split(":", 1)
|
|
|
+ try:
|
|
|
+ if parts[0].strip().lower() in {"background",
|
|
|
+ "background-image"}:
|
|
|
+ yield (rgx.search(parts[1])
|
|
|
+ .group("url").strip("\"'"))
|
|
|
+ # Malformed CSS or no match for URL
|
|
|
+ except (IndexError, AttributeError):
|
|
|
+ pass
|