Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ Released on May 17, 2013
longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
return the default DOM treebuilder, which uses ``xml.dom.minidom``.

* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
``treeadapters.sax.to_sax`` which is generic and supports any
treewalker; it also resolves all known bugs with ``dom2sax``.

* Optional heuristic character encoding detection is now based on
``charade`` for Python 2.6 - 3.3 compatibility.

Expand Down
18 changes: 18 additions & 0 deletions html5lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,24 @@
(namespaces["mathml"], "mtext")
))

# Maps the qualified name of each foreign attribute (XLink, XML and XMLNS)
# to a (prefix, local name, namespace URI) triple. The bare "xmlns"
# attribute has no prefix, so its prefix entry is None.
adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
"xlink:href": ("xlink", "href", namespaces["xlink"]),
"xlink:role": ("xlink", "role", namespaces["xlink"]),
"xlink:show": ("xlink", "show", namespaces["xlink"]),
"xlink:title": ("xlink", "title", namespaces["xlink"]),
"xlink:type": ("xlink", "type", namespaces["xlink"]),
"xml:base": ("xml", "base", namespaces["xml"]),
"xml:lang": ("xml", "lang", namespaces["xml"]),
"xml:space": ("xml", "space", namespaces["xml"]),
"xmlns": (None, "xmlns", namespaces["xmlns"]),
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
}

# Inverse of adjustForeignAttributes: maps (namespace URI, local name) back
# to the qualified attribute name. The prefix is dropped from the key.
unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
adjustForeignAttributes.items()])

spaceCharacters = frozenset((
"\t",
"\n",
Expand Down
16 changes: 2 additions & 14 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .constants import cdataElements, rcdataElements
from .constants import tokenTypes, ReparseException, namespaces
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
from .constants import adjustForeignAttributes as adjustForeignAttributesMap


def parse(doc, treebuilder="etree", encoding=None,
Expand Down Expand Up @@ -333,20 +334,7 @@ def adjustSVGAttributes(self, token):
del token["data"][originalName]

def adjustForeignAttributes(self, token):
replacements = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
"xlink:href": ("xlink", "href", namespaces["xlink"]),
"xlink:role": ("xlink", "role", namespaces["xlink"]),
"xlink:show": ("xlink", "show", namespaces["xlink"]),
"xlink:title": ("xlink", "title", namespaces["xlink"]),
"xlink:type": ("xlink", "type", namespaces["xlink"]),
"xml:base": ("xml", "base", namespaces["xml"]),
"xml:lang": ("xml", "lang", namespaces["xml"]),
"xml:space": ("xml", "space", namespaces["xml"]),
"xmlns": (None, "xmlns", namespaces["xmlns"]),
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
}
replacements = adjustForeignAttributesMap

for originalName in token["data"].keys():
if originalName in replacements:
Expand Down
45 changes: 45 additions & 0 deletions html5lib/tests/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import codecs
import glob
import xml.sax.handler

base_path = os.path.split(__file__)[0]

Expand Down Expand Up @@ -130,3 +131,47 @@ def errorMessage(input, expected, actual):
if sys.version_info.major == 2:
msg = msg.encode("ascii", "backslashreplace")
return msg


class TracingSaxHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that records the events it receives.

    Every callback except the prefix-mapping ones appends an entry to
    ``self.visited``: a bare string for the document start/end events and
    a tuple ``(event name, *arguments)`` for everything else.  Tests can
    then compare ``visited`` against an expected list of events.
    """

    def __init__(self):
        # ContentHandler is initialised explicitly rather than via super().
        xml.sax.handler.ContentHandler.__init__(self)
        self.visited = []

    def startDocument(self):
        self.visited.append('startDocument')

    def endDocument(self):
        self.visited.append('endDocument')

    def startPrefixMapping(self, prefix, uri):
        # These are ignored as their order is not guaranteed
        pass

    def endPrefixMapping(self, prefix):
        # These are ignored as their order is not guaranteed
        pass

    def startElement(self, name, attrs):
        # Snapshot the attributes as a plain dict, exactly as startElementNS
        # does: the SAX attributes object may be reused by the caller after
        # this callback returns and does not compare by value, so keeping
        # the live object would make the trace unreliable to assert on.
        self.visited.append(('startElement', name, dict(attrs)))

    def endElement(self, name):
        self.visited.append(('endElement', name))

    def startElementNS(self, name, qname, attrs):
        # ``name`` is the (namespace URI, local name) pair supplied by the
        # caller; attributes are copied into a plain dict for comparison.
        self.visited.append(('startElementNS', name, qname, dict(attrs)))

    def endElementNS(self, name, qname):
        self.visited.append(('endElementNS', name, qname))

    def characters(self, content):
        self.visited.append(('characters', content))

    def ignorableWhitespace(self, whitespace):
        self.visited.append(('ignorableWhitespace', whitespace))

    def processingInstruction(self, target, data):
        self.visited.append(('processingInstruction', target, data))

    def skippedEntity(self, name):
        self.visited.append(('skippedEntity', name))
40 changes: 40 additions & 0 deletions html5lib/tests/test_treeadapters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from __future__ import absolute_import, division, unicode_literals

from . import support # flake8: noqa

import html5lib
from html5lib.treeadapters import sax
from html5lib.treewalkers import getTreeWalker


def test_to_sax():
    """Check the SAX events ``sax.to_sax`` emits for a small parsed document."""
    xhtml = 'http://www.w3.org/1999/xhtml'

    def opened(tag, attrs=None):
        # Expected record for a namespaced start-element event.
        return ('startElementNS', (xhtml, tag), tag,
                attrs if attrs is not None else {})

    def closed(tag):
        # Expected record for a namespaced end-element event.
        return ('endElementNS', (xhtml, tag), tag)

    handler = support.TracingSaxHandler()
    document = html5lib.parse("""<html xml:lang="en">
<title>Directory Listing</title>
<a href="/"><b/></p>
""", treebuilder="etree")
    tree_walker = getTreeWalker("etree")
    sax.to_sax(tree_walker(document), handler)

    expected = [
        'startDocument',
        opened('html', {(None, 'xml:lang'): 'en'}),
        opened('head'),
        opened('title'),
        ('characters', 'Directory Listing'),
        closed('title'),
        ('characters', '\n '),
        closed('head'),
        opened('body'),
        opened('a', {(None, 'href'): '/'}),
        opened('b'),
        opened('p'),
        closed('p'),
        ('characters', '\n '),
        closed('b'),
        closed('a'),
        closed('body'),
        closed('html'),
        'endDocument',
    ]
    assert expected == handler.visited
Empty file.
44 changes: 44 additions & 0 deletions html5lib/treeadapters/sax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import absolute_import, division, unicode_literals

from xml.sax.xmlreader import AttributesNSImpl

from ..constants import adjustForeignAttributes, unadjustForeignAttributes

# Prefix -> namespace URI for every foreign attribute that carries a prefix;
# entries whose prefix is None are left out.
prefix_mapping = {}
for prefix, localName, namespace in adjustForeignAttributes.values():
    if prefix is None:
        continue
    prefix_mapping[prefix] = namespace


def to_sax(walker, handler):
    """Replay the tokens produced by a treewalker as SAX events.

    ``walker`` is iterated for token dicts; ``handler`` receives the
    corresponding namespace-aware content-handler callbacks.  The whole
    stream is wrapped in ``startDocument``/``endDocument`` and in prefix
    mappings for every entry of the module-level ``prefix_mapping``.
    Doctype and comment tokens produce no event.
    """
    handler.startDocument()
    for prefix, namespace in prefix_mapping.items():
        handler.startPrefixMapping(prefix, namespace)

    for token in walker:
        token_type = token["type"]
        if token_type in ("Doctype", "Comment"):
            # Neither has a callback on the handler interface used here.
            continue
        elif token_type in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        elif token_type == "EndTag":
            local_name = token["name"]
            handler.endElementNS((token["namespace"], local_name), local_name)
        elif token_type in ("StartTag", "EmptyTag"):
            local_name = token["name"]
            qualified = (token["namespace"], local_name)
            attributes = AttributesNSImpl(token["data"],
                                          unadjustForeignAttributes)
            handler.startElementNS(qualified, local_name, attributes)
            if token_type == "EmptyTag":
                # An empty tag has no separate end token, so close it now.
                handler.endElementNS(qualified, local_name)
        else:
            assert False, "Unknown token type"

    for prefix in prefix_mapping:
        handler.endPrefixMapping(prefix)
    handler.endDocument()
65 changes: 1 addition & 64 deletions html5lib/treebuilders/dom.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import absolute_import, division, unicode_literals


from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
from xml.dom import minidom, Node
import weakref

from . import _base
Expand Down Expand Up @@ -220,69 +220,6 @@ def serializeElement(element, indent=0):

return "\n".join(rv)

# Recursively walk a DOM node and report it to a SAX-style content handler.
#
# node    -- the DOM node to report; dispatch is on node.nodeType.
# handler -- object receiving the SAX callbacks (startElement[NS],
#            endElement[NS], characters, start/endPrefixMapping,
#            start/endDocument).
# nsmap   -- prefix -> namespace URI mapping in effect for this node. A falsy
#            value selects the non-namespace startElement/endElement events.
#            The default dict is shared between calls, but this function
#            copies nsmap before adding to it, so the default itself is not
#            modified here.
def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
# Namespace-unaware mode: pass the node's attributes through untouched.
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes:
dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else:
# Working copy of the attributes as returned by itemsNS(); edited below.
attributes = dict(node.attributes.itemsNS())

# gather namespace declarations: attributes in the XMLNS namespace, or
# un-namespaced attributes whose name starts with 'xmlns', are announced
# as prefix mappings and removed from the attribute dict
prefixes = []
for attrname in list(node.attributes.keys()):
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))):
# None for the bare 'xmlns' attribute, otherwise the whole attribute name.
# NOTE(review): for 'xmlns:foo' this yields 'xmlns:foo' rather than 'foo',
# which would not match the prefix looked up further down -- confirm.
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
# Copy before writing so the caller's (or the default) mapping is untouched.
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
# NOTE(review): assumes itemsNS() keys use nodeName as the second element;
# verify against the DOM implementation in use.
del attributes[(attr.namespaceURI, attr.nodeName)]

# apply namespace declarations: un-namespaced attributes written as
# 'prefix:name' are re-keyed under the namespace URI mapped to that prefix
for attrname in list(node.attributes.keys()):
attr = node.getAttributeNode(attrname)
if attr.namespaceURI is None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if prefix in nsmap:
del attributes[(attr.namespaceURI, attr.nodeName)]
attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue

# SAX events: fall back to the default namespace (the None key in nsmap)
# when the node carries no namespace of its own
ns = node.namespaceURI or nsmap.get(None, None)
handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
for child in node.childNodes:
dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
# Close the prefix mappings opened for this element.
for prefix in prefixes:
handler.endPrefixMapping(prefix)

elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
handler.characters(node.nodeValue)

elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes:
dom2sax(child, handler, nsmap)
handler.endDocument()

elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
# A fragment reports only its children; no document events are emitted.
for child in node.childNodes:
dom2sax(child, handler, nsmap)

else:
# Remaining node types produce no SAX event:
# ATTRIBUTE_NODE
# ENTITY_NODE
# PROCESSING_INSTRUCTION_NODE
# COMMENT_NODE
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass

return locals()


Expand Down