delb-xml · funkyfuture · May 19, 2024 · May 19, 2024 · May 20, 2024 · May 20, 2024
diff --git a/_delb/nodes.py b/_delb/nodes.py
@@ -1893,48 +1893,6 @@ def clone(self, deep: bool = False, quick_and_unsafe: bool = False) -> TagNode:
 
         return result
 
-    def _collapse_whitespace(self, normalize_space: str = "default"):
-        with _wrapper_cache:
-            normalize_space = cast(
-                "str", self.attributes.get(XML_ATT_SPACE, normalize_space)
-            )
-
-            if normalize_space == "default":
-                for child_node in self.iterate_children():
-                    if not isinstance(child_node, TextNode):
-                        continue
-
-                    crunched = _crunch_whitespace(child_node.content)
-                    crunched_stripped = crunched.strip()
-
-                    if (
-                        crunched_stripped  # has non-whitespace content
-                        and crunched[0] == " "  # begins w/ whitespace
-                        and cast("int", child_node.index) > 0  # isn't first child
-                    ):
-                        child_node.content = f" {crunched_stripped}"
-                    elif (
-                        crunched[-1] == " "  # ends w/ whitespace
-                        and child_node is not self.first_child
-                        and child_node is not self.last_child
-                    ) or (
-                        crunched_stripped  # has non-whitespace content
-                        and crunched[-1] == " "  # ends w/ whitespace
-                        and child_node is self.first_child
-                        and child_node is not self.last_child
-                    ):
-                        child_node.content = f"{crunched.strip()} "
-                    elif len(self) == 1 and crunched == " ":
-                        # is only child and contains only whitespace
-                        child_node.content = " "
-                    else:
-                        child_node.content = crunched_stripped
-            else:
-                assert normalize_space == "preserve"
-
-            for child_node in self.iterate_children(is_tag_node):
-                cast("TagNode", child_node)._collapse_whitespace(normalize_space)
-
     def css_select(
         self, expression: str, namespaces: Optional[NamespaceDeclarations] = None
     ) -> QueryResults:
@@ -2359,8 +2317,8 @@ def parse(
         assert isinstance(parser, etree.XMLParser)
         result = _wrapper_cache(etree.fromstring(text, parser=parser))
         assert isinstance(result, TagNode)
-        if parser_options.collapse_whitespace:
-            result._collapse_whitespace()
+        if parser_options.reduce_whitespace:
+            result._reduce_whitespace()
         return result
 
     @property
@@ -2397,6 +2355,60 @@ def prepend_children(self, *node: NodeBase, clone: bool = False) -> None:
         """
         self.insert_children(0, *node, clone=clone)
 
+    def _reduce_whitespace(self, normalize_space: str = "default"):
+        """
+        Reduces the whitespace of all text nodes in this node's subtree in place.
+
+        :param normalize_space: The whitespace mode inherited from the parent node;
+                                this node's ``xml:space`` attribute takes
+                                precedence. Must resolve to ``"default"`` or
+                                ``"preserve"``.
+        """
+        with _wrapper_cache:
+            normalize_space = cast(
+                "str", self.attributes.get(XML_ATT_SPACE, normalize_space)
+            )
+
+            if normalize_space == "default":
+                for child_node in self.iterate_children():
+                    if not isinstance(child_node, TextNode):
+                        continue
+
+                    crunched = _crunch_whitespace(child_node.content)
+                    crunched_stripped = crunched.strip()
+
+                    # str.startswith / str.endswith are used rather than indexing
+                    # so that an empty text node cannot raise an IndexError
+                    if (
+                        crunched_stripped  # has non-whitespace content
+                        and crunched.startswith(" ")  # begins w/ whitespace
+                        and cast("int", child_node.index) > 0  # isn't first child
+                    ):
+                        child_node.content = f" {crunched_stripped}"
+                    elif (
+                        crunched.endswith(" ")  # ends w/ whitespace
+                        and child_node is not self.first_child
+                        and child_node is not self.last_child
+                    ) or (
+                        crunched_stripped  # has non-whitespace content
+                        and crunched.endswith(" ")  # ends w/ whitespace
+                        and child_node is self.first_child
+                        and child_node is not self.last_child
+                    ):
+                        child_node.content = f"{crunched_stripped} "
+                    elif len(self) == 1 and crunched == " ":
+                        # is only child and contains only whitespace
+                        child_node.content = " "
+                    else:
+                        child_node.content = crunched_stripped
+            else:
+                # this node's own text is left untouched when whitespace is preserved
+                assert normalize_space == "preserve"
+
+            # descend, handing down the mode that is effective for this node
+            for child_node in self.iterate_children(is_tag_node):
+                cast("TagNode", child_node)._reduce_whitespace(normalize_space)
+
     def serialize(
         self,
         *,

diff --git a/_delb/parser.py b/_delb/parser.py
@@ -23,8 +23,8 @@ class ParserOptions:
     """
     The configuration options that define an XML parser's behaviour.
 
-    :param collapse_whitespace: :meth:`Collapse the content's whitespace
-                                <delb.Document.collapse_whitespace>`.
+    :param reduce_whitespace: :meth:`Reduce the content's whitespace
+                              <delb.Document.reduce_whitespace>`.
     :param remove_comments: Ignore comments.
     :param remove_processing_instructions: Don't include processing instructions in the
                                            parsed tree.
@@ -34,13 +34,13 @@ class ParserOptions:
 
     def __init__(
         self,
-        collapse_whitespace: bool = False,
+        reduce_whitespace: bool = False,
         remove_comments: bool = False,
         remove_processing_instructions: bool = False,
         resolve_entities: bool = True,
         unplugged: bool = False,
     ):
-        self.collapse_whitespace = collapse_whitespace
+        self.reduce_whitespace = reduce_whitespace
         self.remove_comments = remove_comments
         self.remove_processing_instructions = remove_processing_instructions
         self.resolve_entities = resolve_entities

diff --git a/delb/__init__.py b/delb/__init__.py
@@ -334,9 +334,9 @@ def __load_source(cls, source: Any, config: SimpleNamespace) -> TagNode:
         root = _wrapper_cache(loaded_tree.getroot())
         assert isinstance(root, TagNode)
 
-        if config.parser_options.collapse_whitespace:
+        if config.parser_options.reduce_whitespace:
             with altered_default_filters():
-                root._collapse_whitespace()
+                root._reduce_whitespace()
 
         return root
 
@@ -366,15 +366,14 @@ def clone(self) -> Document:
         return result
 
     def collapse_whitespace(self):
-        """
-        Collapses whitespace as described here:
-        https://wiki.tei-c.org/index.php/XML_Whitespace#Recommendations
-
-        Implicitly merges all neighbouring text nodes.
-        """
-        self.merge_text_nodes()
-        with altered_default_filters():
-            self.root._collapse_whitespace()
+        """Deprecated alias of :meth:`reduce_whitespace`."""
+        # stacklevel=2 attributes the warning to the caller's code, not to delb
+        warn(
+            "This method was renamed to `reduce_whitespace`.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        self.reduce_whitespace()
 
     def css_select(
         self, expression: str, namespaces: Optional[NamespaceDeclarations] = None
@@ -413,6 +412,19 @@ def new_tag_node(
             local_name=local_name, attributes=attributes, namespace=namespace
         )
 
+    def reduce_whitespace(self):
+        """
+        Collapses and trims whitespace as described in this `TEI recommendation`_.
+        Text in (sub-)trees with structured data should be trimmed further in
+        subsequent processing.
+        Implicitly merges all neighbouring text nodes.
+
+        .. _TEI recommendation: https://wiki.tei-c.org/index.php/XML_Whitespace
+        """
+        self.merge_text_nodes()
+        with altered_default_filters():
+            self.root._reduce_whitespace()
+
     @property
     def root(self) -> TagNode:
         """The root node of a document's *content* tree."""

diff --git a/integration-tests/test-parse-serialize-equality.py b/integration-tests/test-parse-serialize-equality.py
@@ -26,7 +26,7 @@ def parse_serialize_compare(file: Path):
 
     try:
         document = Document(
-            file, parser_options=ParserOptions(collapse_whitespace=False)
+            file, parser_options=ParserOptions(reduce_whitespace=False)
         )
     except FailedDocumentLoading as exc:
         print(

diff --git a/tests/test_document.py b/tests/test_document.py
@@ -6,7 +6,6 @@
     Document,
     DocumentMixinBase,
     TagNode,
-    TextNode,
     new_comment_node,
     new_processing_instruction_node,
 )
@@ -27,44 +26,6 @@ def test_clone():
     document.clone()
 
 
-def test_collapse_whitespace():
-    document = Document(
-        """
-    <root>
-        <title>
-            I Roy -
-            <hi>Touting I Self</hi>
-        </title>
-        <matrix xml:space="preserve">HB 243 A  Re</matrix>
-        <matrix xml:space="preserve">HB 243 B\tRe</matrix>
-    </root>
-    """
-    )
-
-    document.collapse_whitespace()
-    root = document.root
-
-    assert root.first_child.full_text == "I Roy - Touting I Self"
-    assert root.css_select("matrix")[0].full_text == "HB 243 A  Re"
-    assert root.css_select("matrix")[1].full_text == "HB 243 B\tRe"
-
-    #
-
-    document = Document(
-        '<docImprint><hi rendition="#g">Veröffentlicht im</hi> <docDate>'
-        '<hi rendition="#g">Februar</hi> 1848</docDate>.</docImprint>'
-    )
-
-    hi_1 = document.root.first_child
-    assert hi_1._etree_obj.tail == " "
-    x = hi_1.fetch_following_sibling()
-    assert isinstance(x, TextNode)
-    assert x.content == " "
-
-    document.collapse_whitespace()
-    assert document.root.full_text == "Veröffentlicht im Februar 1848."
-
-
 def test_contains():
     document_a = Document("<root><a/></root>")
     document_b = Document("<root><a/></root>")
@@ -117,14 +78,39 @@ class DocumentSubclass(Document):
     )
 
 
-def test_set_root():
-    document = Document("<root><node/></root>")
-    document.root = document.root[0].detach()
-    assert str(document) == '<?xml version="1.0" encoding="UTF-8"?><node/>'
+def test_reduce_whitespace():
+    document = Document(
+        """
+    <root>
+        <title>
+            I Roy -
+            <hi>Touting I Self</hi>
+        </title>
+        <matrix xml:space="preserve">HB 243 A  Re</matrix>
+        <matrix xml:space="preserve">HB 243 B\tRe</matrix>
+    </root>
+    """
+    )
 
-    document_2 = Document("<root><replacement/>parts</root>")
-    with pytest.raises(ValueError, match="detached node"):
-        document.root = document_2.root[0]
+    document.reduce_whitespace()
+    root = document.root
+
+    assert root.first_child.full_text == "I Roy - Touting I Self"
+    assert root.css_select("matrix")[0].full_text == "HB 243 A  Re"
+    assert root.css_select("matrix")[1].full_text == "HB 243 B\tRe"
+
+    #
+
+    document = Document(
+        '<docImprint><hi rendition="#g">Veröffentlicht im</hi> <docDate>'
+        '<hi rendition="#g">Februar</hi> 1848</docDate>.</docImprint>'
+    )
+    document.reduce_whitespace()
+    assert document.root.full_text == "Veröffentlicht im Februar 1848."
+
+    document = Document("<root><lb/>Hello <lb/> <lb/> <lb/> world!</root>")
+    document.reduce_whitespace()
+    assert document.root.full_text == "Hello    world!"
 
 
 def test_root_siblings():
@@ -157,6 +143,16 @@ def test_root_siblings():
         tail_nodes.pop(0)
 
 
+def test_set_root():
+    document = Document("<root><node/></root>")
+    document.root = document.root[0].detach()
+    assert str(document) == '<?xml version="1.0" encoding="UTF-8"?><node/>'
+
+    document_2 = Document("<root><replacement/>parts</root>")
+    with pytest.raises(ValueError, match="detached node"):
+        document.root = document_2.root[0]
+
+
 def test_xpath(files_path):
     document = Document(files_path / "tei_marx_manifestws_1848.TEI-P5.xml")
 

diff --git a/tests/test_serialization.py b/tests/test_serialization.py
@@ -114,7 +114,7 @@ def test_significant_whitespace_is_saved(result_file):
     document.save(result_file, indentation="  ")
 
     assert (
-        Document(result_file, parser_options=ParserOptions(collapse_whitespace=True))
+        Document(result_file, parser_options=ParserOptions(reduce_whitespace=True))
         .xpath("hi")
         .first.fetch_following_sibling()
         == " "
@@ -188,7 +188,7 @@ def test_that_root_siblings_are_preserved(files_path, result_file):
 
 
 def test_transparency(files_path, result_file):
-    parser_options = ParserOptions(collapse_whitespace=False)
+    parser_options = ParserOptions(reduce_whitespace=False)
     for file in (x for x in files_path.glob("[!tei_]*.xml")):
         origin = Document(file, parser_options=parser_options)
         origin.save(result_file)

diff --git a/tests/test_subclasses.py b/tests/test_subclasses.py
@@ -14,7 +14,7 @@
 class TEIDocument(Document):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.collapse_whitespace()
+        self.reduce_whitespace()
         self.text_characters = len(self.root.full_text)
 
     @staticmethod

diff --git a/tests/test_text_nodes.py b/tests/test_text_nodes.py
@@ -172,7 +172,7 @@ def test_bindings(sample_document):
 def test_construction():
     root = Document(
         "<root><node>one</node> two </root>",
-        parser_options=ParserOptions(collapse_whitespace=True),
+        parser_options=ParserOptions(reduce_whitespace=True),
     ).root
     node, two = tuple(x for x in root.iterate_children())
     one = node[0]

diff --git a/tests/test_transform.py b/tests/test_transform.py
@@ -99,7 +99,7 @@ def test_transformation_options():
             <choice><sic>taeteraetae</sic><corr>täterätä</corr></choice>
         </root>
         """,
-        parser_options=ParserOptions(collapse_whitespace=True),
+        parser_options=ParserOptions(reduce_whitespace=True),
     )
     transformation = ResolveChoice()
     result = transformation(document.root)
@@ -116,7 +116,7 @@ def test_transformation_sequence():
             <div copyOf="#d1"/>
         </root>
         """,
-        parser_options=ParserOptions(collapse_whitespace=True),
+        parser_options=ParserOptions(reduce_whitespace=True),
     )
     transformation = TransformationSequence(
         ResolveCopyOf, ResolveChoice(ResolveChoiceOptions(corr=False))
@@ -135,7 +135,7 @@ def test_transformation_sequence_sequence():
             <name>caro</name>
             <name>boudi</name>
         </cast>""",
-        parser_options=ParserOptions(collapse_whitespace=True),
+        parser_options=ParserOptions(reduce_whitespace=True),
     )
     root = Document("<doc><body><ul/></body></doc>").root
     transformation = TransformationSequence(