diff --git a/CHANGELOG.md b/CHANGELOG.md index 996773a..387bd65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. - fix whitespace in plaintext conversion (#207) - add srcset support (#209) - add language support (#210) +- add extension to expose the DOM for embedded properties (#208) ## 1.1.3 - 2023-06-28 - reduce instances where photo is implied (#135) diff --git a/README.md b/README.md index f7e0136..47f21a9 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,8 @@ obj = mf2py.parse(url="http://tommorris.org/") ### Extensions +Use `expose_dom=True` to expose the DOM of embedded properties. + --- `parse` is a convenience method that actually delegates to diff --git a/mf2py/parse_property.py b/mf2py/parse_property.py index 30c565a..3d2222b 100644 --- a/mf2py/parse_property.py +++ b/mf2py/parse_property.py @@ -94,14 +94,13 @@ def datetime(el, default_date=None): ) -def embedded(el, root_lang, document_lang, base_url=""): +def embedded(el, base_url, root_lang, document_lang, expose_dom): """Process e-* properties""" for tag in el.find_all(): for attr in ("href", "src", "cite", "data", "poster"): if attr in tag.attrs: tag.attrs[attr] = try_urljoin(base_url, tag.attrs[attr]) prop_value = { - "html": el.decode_contents().strip(), # secret bs4 method to get innerHTML "value": get_textContent(el, replace_img=True, base_url=base_url), } if lang := el.attrs.get("lang"): @@ -110,4 +109,8 @@ def embedded(el, root_lang, document_lang, base_url=""): prop_value["lang"] = root_lang elif document_lang: prop_value["lang"] = document_lang + if expose_dom: + prop_value["dom"] = el + else: + prop_value["html"] = el.decode_contents().strip() return prop_value diff --git a/mf2py/parser.py b/mf2py/parser.py index 5cba036..9ae691f 100644 --- a/mf2py/parser.py +++ b/mf2py/parser.py @@ -12,7 +12,7 @@ from .version import __version__ -def parse(doc=None, url=None, html_parser=None): +def parse(doc=None, url=None, html_parser=None, expose_dom=False): """ Parse a microformats2 document or url and return a json dictionary. @@ -25,10 +25,11 @@ def parse(doc=None, url=None, html_parser=None): html_parser (string): optional, select a specific HTML parser. Valid options from the BeautifulSoup documentation are: "html", "xml", "html5", "lxml", "html5lib", and "html.parser" + expose_dom (boolean): optional, expose the DOM of embedded properties. Return: a json dict represented the structured data in this document. """ - return Parser(doc, url, html_parser).to_dict() + return Parser(doc, url, html_parser, expose_dom).to_dict() class Parser(object): @@ -45,6 +46,7 @@ class Parser(object): options from the BeautifulSoup documentation are: "html", "xml", "html5", "lxml", "html5lib", and "html.parser" defaults to "html5lib" + expose_dom (boolean): optional, expose the DOM of embedded properties. Attributes: useragent (string): the User-Agent string for the Parser @@ -54,7 +56,7 @@ class Parser(object): ua_url = "https://github.com/microformats/mf2py" useragent = "{0} - version {1} - {2}".format(ua_desc, __version__, ua_url) - def __init__(self, doc=None, url=None, html_parser=None): + def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False): self.__url__ = None self.__doc__ = None self._preserve_doc = False @@ -68,6 +70,7 @@ def __init__(self, doc=None, url=None, html_parser=None): "version": __version__, }, } + self.expose_dom = expose_dom self.lang = None # use default parser if none specified @@ -372,7 +375,7 @@ def parse_props(el, root_lang): embedded_el = copy.copy(embedded_el) temp_fixes.rm_templates(embedded_el) e_value = parse_property.embedded( - embedded_el, root_lang, self.lang, base_url=self.__url__ + embedded_el, self.__url__, root_lang, self.lang, self.expose_dom ) if root_class_names: diff --git a/test/test_parser.py b/test/test_parser.py index a781619..b5b5ee9 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -3,6 +3,7 @@ import sys from unittest import TestCase +import bs4 import mock from bs4 import BeautifulSoup @@ -191,6 +192,13 @@ def test_embedded_parsing(): ) +def test_embedded_exposed_dom(): + result = parse_fixture("embedded.html", expose_dom=True) + content = result["items"][0]["properties"]["content"][0] + assert "html" not in content + assert isinstance(content["dom"], bs4.element.Tag) + + def test_hoisting_nested_hcard(): result = parse_fixture("nested_hcards.html") expected = [