html5lib · gsnedders · May 5, 2013 · May 4, 2013
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -36,6 +36,10 @@ Released on May 17, 2013
   longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
   return the default DOM treebuilder, which uses ``xml.dom.minidom``.
 
+* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
+  ``treeadapters.sax.to_sax``, which is generic and supports any
+  treewalker; it also resolves all known bugs with ``dom2sax``.
+
 * Optional heuristic character encoding detection now based on
   ``charade`` for Python 2.6 - 3.3 compatibility.
 

diff --git a/html5lib/constants.py b/html5lib/constants.py
@@ -433,6 +433,24 @@
     (namespaces["mathml"], "mtext")
 ))
 
+adjustForeignAttributes = {
+    "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
+    "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
+    "xlink:href": ("xlink", "href", namespaces["xlink"]),
+    "xlink:role": ("xlink", "role", namespaces["xlink"]),
+    "xlink:show": ("xlink", "show", namespaces["xlink"]),
+    "xlink:title": ("xlink", "title", namespaces["xlink"]),
+    "xlink:type": ("xlink", "type", namespaces["xlink"]),
+    "xml:base": ("xml", "base", namespaces["xml"]),
+    "xml:lang": ("xml", "lang", namespaces["xml"]),
+    "xml:space": ("xml", "space", namespaces["xml"]),
+    "xmlns": (None, "xmlns", namespaces["xmlns"]),
+    "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
+}
+
+unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
+                                  adjustForeignAttributes.items()])
+
 spaceCharacters = frozenset((
     "\t",
     "\n",

diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
@@ -17,6 +17,7 @@
 from .constants import cdataElements, rcdataElements
 from .constants import tokenTypes, ReparseException, namespaces
 from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
+from .constants import adjustForeignAttributes as adjustForeignAttributesMap
 
 
 def parse(doc, treebuilder="etree", encoding=None,
@@ -333,20 +334,7 @@ def adjustSVGAttributes(self, token):
                 del token["data"][originalName]
 
     def adjustForeignAttributes(self, token):
-        replacements = {
-            "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
-            "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
-            "xlink:href": ("xlink", "href", namespaces["xlink"]),
-            "xlink:role": ("xlink", "role", namespaces["xlink"]),
-            "xlink:show": ("xlink", "show", namespaces["xlink"]),
-            "xlink:title": ("xlink", "title", namespaces["xlink"]),
-            "xlink:type": ("xlink", "type", namespaces["xlink"]),
-            "xml:base": ("xml", "base", namespaces["xml"]),
-            "xml:lang": ("xml", "lang", namespaces["xml"]),
-            "xml:space": ("xml", "space", namespaces["xml"]),
-            "xmlns": (None, "xmlns", namespaces["xmlns"]),
-            "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
-        }
+        replacements = adjustForeignAttributesMap
 
         for originalName in token["data"].keys():
             if originalName in replacements:

diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
@@ -4,6 +4,7 @@
 import sys
 import codecs
 import glob
+import xml.sax.handler
 
 base_path = os.path.split(__file__)[0]
 
@@ -130,3 +131,47 @@ def errorMessage(input, expected, actual):
     if sys.version_info.major == 2:
         msg = msg.encode("ascii", "backslashreplace")
     return msg
+
+
+class TracingSaxHandler(xml.sax.handler.ContentHandler):
+    def __init__(self):
+        xml.sax.handler.ContentHandler.__init__(self)
+        self.visited = []
+
+    def startDocument(self):
+        self.visited.append('startDocument')
+
+    def endDocument(self):
+        self.visited.append('endDocument')
+
+    def startPrefixMapping(self, prefix, uri):
+        # These are ignored as their order is not guaranteed
+        pass
+
+    def endPrefixMapping(self, prefix):
+        # These are ignored as their order is not guaranteed
+        pass
+
+    def startElement(self, name, attrs):
+        self.visited.append(('startElement', name, attrs))
+
+    def endElement(self, name):
+        self.visited.append(('endElement', name))
+
+    def startElementNS(self, name, qname, attrs):
+        self.visited.append(('startElementNS', name, qname, dict(attrs)))
+
+    def endElementNS(self, name, qname):
+        self.visited.append(('endElementNS', name, qname))
+
+    def characters(self, content):
+        self.visited.append(('characters', content))
+
+    def ignorableWhitespace(self, whitespace):
+        self.visited.append(('ignorableWhitespace', whitespace))
+
+    def processingInstruction(self, target, data):
+        self.visited.append(('processingInstruction', target, data))
+
+    def skippedEntity(self, name):
+        self.visited.append(('skippedEntity', name))
diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py
@@ -0,0 +1,40 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import support  # flake8: noqa
+
+import html5lib
+from html5lib.treeadapters import sax
+from html5lib.treewalkers import getTreeWalker
+
+
+def test_to_sax():
+    handler = support.TracingSaxHandler()
+    tree = html5lib.parse("""<html xml:lang="en">
+        <title>Directory Listing</title>
+        <a href="/"><b/></p>
+    """, treebuilder="etree")
+    walker = getTreeWalker("etree")
+    sax.to_sax(walker(tree), handler)
+    expected = [
+        'startDocument',
+        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
+            'html', {(None, 'xml:lang'): 'en'}),
+        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
+        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
+        ('characters', 'Directory Listing'),
+        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
+        ('characters', '\n        '),
+        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
+        ('startElementNS',  ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
+        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
+        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
+        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
+        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
+        ('characters', '\n    '),
+        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
+        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
+        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
+        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
+        'endDocument',
+    ]
+    assert expected == handler.visited
diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py
diff --git a/html5lib/treeadapters/sax.py b/html5lib/treeadapters/sax.py
@@ -0,0 +1,44 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from xml.sax.xmlreader import AttributesNSImpl
+
+from ..constants import adjustForeignAttributes, unadjustForeignAttributes
+
+prefix_mapping = {}
+for prefix, localName, namespace in adjustForeignAttributes.values():
+    if prefix is not None:
+        prefix_mapping[prefix] = namespace
+
+
+def to_sax(walker, handler):
+    """Call SAX-like content handler based on treewalker walker"""
+    handler.startDocument()
+    for prefix, namespace in prefix_mapping.items():
+        handler.startPrefixMapping(prefix, namespace)
+
+    for token in walker:
+        type = token["type"]
+        if type == "Doctype":
+            continue
+        elif type in ("StartTag", "EmptyTag"):
+            attrs = AttributesNSImpl(token["data"],
+                                     unadjustForeignAttributes)
+            handler.startElementNS((token["namespace"], token["name"]),
+                                   token["name"],
+                                   attrs)
+            if type == "EmptyTag":
+                handler.endElementNS((token["namespace"], token["name"]),
+                                     token["name"])
+        elif type == "EndTag":
+            handler.endElementNS((token["namespace"], token["name"]),
+                                 token["name"])
+        elif type in ("Characters", "SpaceCharacters"):
+            handler.characters(token["data"])
+        elif type == "Comment":
+            pass
+        else:
+            assert False, "Unknown token type"
+
+    for prefix, namespace in prefix_mapping.items():
+        handler.endPrefixMapping(prefix)
+    handler.endDocument()
diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py
@@ -1,7 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals
 
 
-from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
+from xml.dom import minidom, Node
 import weakref
 
 from . import _base
@@ -220,69 +220,6 @@ def serializeElement(element, indent=0):
 
         return "\n".join(rv)
 
-    def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
-        if node.nodeType == Node.ELEMENT_NODE:
-            if not nsmap:
-                handler.startElement(node.nodeName, node.attributes)
-                for child in node.childNodes:
-                    dom2sax(child, handler, nsmap)
-                handler.endElement(node.nodeName)
-            else:
-                attributes = dict(node.attributes.itemsNS())
-
-                # gather namespace declarations
-                prefixes = []
-                for attrname in list(node.attributes.keys()):
-                    attr = node.getAttributeNode(attrname)
-                    if (attr.namespaceURI == XMLNS_NAMESPACE or
-                       (attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))):
-                        prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
-                        handler.startPrefixMapping(prefix, attr.nodeValue)
-                        prefixes.append(prefix)
-                        nsmap = nsmap.copy()
-                        nsmap[prefix] = attr.nodeValue
-                        del attributes[(attr.namespaceURI, attr.nodeName)]
-
-                # apply namespace declarations
-                for attrname in list(node.attributes.keys()):
-                    attr = node.getAttributeNode(attrname)
-                    if attr.namespaceURI is None and ':' in attr.nodeName:
-                        prefix = attr.nodeName.split(':')[0]
-                        if prefix in nsmap:
-                            del attributes[(attr.namespaceURI, attr.nodeName)]
-                            attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue
-
-                # SAX events
-                ns = node.namespaceURI or nsmap.get(None, None)
-                handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
-                for child in node.childNodes:
-                    dom2sax(child, handler, nsmap)
-                handler.endElementNS((ns, node.nodeName), node.nodeName)
-                for prefix in prefixes:
-                    handler.endPrefixMapping(prefix)
-
-        elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
-            handler.characters(node.nodeValue)
-
-        elif node.nodeType == Node.DOCUMENT_NODE:
-            handler.startDocument()
-            for child in node.childNodes:
-                dom2sax(child, handler, nsmap)
-            handler.endDocument()
-
-        elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
-            for child in node.childNodes:
-                dom2sax(child, handler, nsmap)
-
-        else:
-            # ATTRIBUTE_NODE
-            # ENTITY_NODE
-            # PROCESSING_INSTRUCTION_NODE
-            # COMMENT_NODE
-            # DOCUMENT_TYPE_NODE
-            # NOTATION_NODE
-            pass
-
     return locals()