Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add extension to expose DOM for embedded properties #208

Merged
merged 11 commits into from
Nov 30, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file.
- fix whitespace in plaintext conversion (#207)
- add srcset support (#209)
- add language support (#210)
- add extension to expose the DOM for embedded properties (#208)

## 1.1.3 - 2023-06-28
- reduce instances where photo is implied (#135)
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ obj = mf2py.parse(url="http://tommorris.org/")

### Extensions

Use `expose_dom=True` to expose the DOM of embedded properties.

---

`parse` is a convenience method that actually delegates to
Expand Down
7 changes: 5 additions & 2 deletions mf2py/parse_property.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,13 @@ def datetime(el, default_date=None):
)


def embedded(el, root_lang, document_lang, base_url=""):
def embedded(el, base_url, root_lang, document_lang, expose_dom):
"""Process e-* properties"""
for tag in el.find_all():
for attr in ("href", "src", "cite", "data", "poster"):
if attr in tag.attrs:
tag.attrs[attr] = try_urljoin(base_url, tag.attrs[attr])
prop_value = {
"html": el.decode_contents().strip(), # secret bs4 method to get innerHTML
"value": get_textContent(el, replace_img=True, base_url=base_url),
}
if lang := el.attrs.get("lang"):
Expand All @@ -110,4 +109,8 @@ def embedded(el, root_lang, document_lang, base_url=""):
prop_value["lang"] = root_lang
elif document_lang:
prop_value["lang"] = document_lang
if expose_dom:
prop_value["dom"] = el
else:
prop_value["html"] = el.decode_contents().strip()
return prop_value
11 changes: 7 additions & 4 deletions mf2py/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .version import __version__


def parse(doc=None, url=None, html_parser=None):
def parse(doc=None, url=None, html_parser=None, expose_dom=False):
"""
Parse a microformats2 document or url and return a json dictionary.

Expand All @@ -25,10 +25,11 @@ def parse(doc=None, url=None, html_parser=None):
html_parser (string): optional, select a specific HTML parser. Valid
options from the BeautifulSoup documentation are:
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
expose_dom (boolean): optional, expose the DOM of embedded properties.

Return: a json dict representing the structured data in this document.
"""
return Parser(doc, url, html_parser).to_dict()
return Parser(doc, url, html_parser, expose_dom).to_dict()


class Parser(object):
Expand All @@ -45,6 +46,7 @@ class Parser(object):
options from the BeautifulSoup documentation are:
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
defaults to "html5lib"
expose_dom (boolean): optional, expose the DOM of embedded properties.

Attributes:
useragent (string): the User-Agent string for the Parser
Expand All @@ -54,7 +56,7 @@ class Parser(object):
ua_url = "https://github.com/microformats/mf2py"
useragent = "{0} - version {1} - {2}".format(ua_desc, __version__, ua_url)

def __init__(self, doc=None, url=None, html_parser=None):
def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False):
self.__url__ = None
self.__doc__ = None
self._preserve_doc = False
Expand All @@ -68,6 +70,7 @@ def __init__(self, doc=None, url=None, html_parser=None):
"version": __version__,
},
}
self.expose_dom = expose_dom
self.lang = None

# use default parser if none specified
Expand Down Expand Up @@ -372,7 +375,7 @@ def parse_props(el, root_lang):
embedded_el = copy.copy(embedded_el)
temp_fixes.rm_templates(embedded_el)
e_value = parse_property.embedded(
embedded_el, root_lang, self.lang, base_url=self.__url__
embedded_el, self.__url__, root_lang, self.lang, self.expose_dom
)

if root_class_names:
Expand Down
8 changes: 8 additions & 0 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
from unittest import TestCase

import bs4
import mock
from bs4 import BeautifulSoup

Expand Down Expand Up @@ -191,6 +192,13 @@ def test_embedded_parsing():
)


def test_embedded_exposed_dom():
    """Check that `expose_dom=True` yields a `dom` Tag in place of `html`."""
    parsed = parse_fixture("embedded.html", expose_dom=True)
    first_item = parsed["items"][0]
    e_content = first_item["properties"]["content"][0]
    # The serialized markup must be absent when the DOM is exposed ...
    assert "html" not in e_content
    # ... and the BeautifulSoup element itself is returned under "dom".
    assert isinstance(e_content["dom"], bs4.element.Tag)


def test_hoisting_nested_hcard():
result = parse_fixture("nested_hcards.html")
expected = [
Expand Down
Loading