# Differences from the current specification (23 December 2006) are as follows:
# * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body>
#   always exist.
# * We also deal with content when there's no DOCTYPE.
# It is expected that the specification will catch up with us in due course ;-)
#
# It should be trivial to add the following cases. However, we should probably
# also look into comment handling and such then...
# * A <p> element end tag creates an empty <p> element when there's no <p>
#   element in scope.
# * A <br> element end tag creates an empty <br> element.
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
import tokenizer
import treebuilders
from treebuilders._base import Marker
from treebuilders import simpletree
import utils
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements, voidElements
class HTMLParser(object):
    """HTML parser. Generates a tree structure from a stream of (possibly
    malformed) HTML"""

    def __init__(self, strict=False, tree=simpletree.TreeBuilder):
        """
        strict - raise an exception when a parse error is encountered

        tree - a treebuilder class controlling the type of tree that will be
        returned. This class is almost always a subclass of
        html5lib.treebuilders._base.TreeBuilder
        """

        # Raise an exception on the first error encountered
        self.strict = strict

        self.tree = tree()
        self.errors = []

        # One long-lived handler object per insertion mode; self.phase always
        # points at one of these.
        self.phases = {
            "initial": InitialPhase(self, self.tree),
            "rootElement": RootElementPhase(self, self.tree),
            "beforeHead": BeforeHeadPhase(self, self.tree),
            "inHead": InHeadPhase(self, self.tree),
            "afterHead": AfterHeadPhase(self, self.tree),
            "inBody": InBodyPhase(self, self.tree),
            "inTable": InTablePhase(self, self.tree),
            "inCaption": InCaptionPhase(self, self.tree),
            "inColumnGroup": InColumnGroupPhase(self, self.tree),
            "inTableBody": InTableBodyPhase(self, self.tree),
            "inRow": InRowPhase(self, self.tree),
            "inCell": InCellPhase(self, self.tree),
            "inSelect": InSelectPhase(self, self.tree),
            "afterBody": AfterBodyPhase(self, self.tree),
            "inFrameset": InFramesetPhase(self, self.tree),
            "afterFrameset": AfterFramesetPhase(self, self.tree),
            "trailingEnd": TrailingEndPhase(self, self.tree)
        }

    def parse(self, stream, encoding=None, innerHTML=False):
        """Parse a HTML document into a well-formed tree

        stream - a filelike object or string containing the HTML to be parsed

        innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
        is not yet supported)

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        Returns whatever the tree builder's getDocument() returns.
        """

        self.tree.reset()
        self.firstStartTag = False
        self.errors = []

        self.phase = self.phases["initial"]
        # We only seem to have InBodyPhase testcases where the following is
        # relevant ... need others too
        self.lastPhase = None

        # We don't actually support innerHTML yet but this should allow
        # assertions
        self.innerHTML = innerHTML

        self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)

        # XXX This is temporary for the moment so there isn't any other
        # changes needed for the parser to work with the iterable tokenizer
        for token in self.tokenizer:
            token = self.normalizeToken(token)
            # Named tokenType (not "type") so the builtin is not shadowed.
            tokenType = token["type"]
            # Looked up on every token because handlers may switch self.phase.
            method = getattr(self.phase, "process%s" % tokenType, None)
            if tokenType in ("Characters", "SpaceCharacters", "Comment"):
                method(token["data"])
            elif tokenType in ("StartTag", "Doctype"):
                method(token["name"], token["data"])
            elif tokenType == "EndTag":
                method(token["name"])
            else:
                # Any other token type is reported as a parse error, using
                # the token's data as the message.
                self.parseError(token["data"])

        # When the loop finishes it's EOF
        self.phase.processEOF()

        return self.tree.getDocument()

    def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a parse error as (stream position, message).

        Raises ParseError after recording it when the parser is strict.
        """
        # XXX The idea is to make data mandatory.
        self.errors.append((self.tokenizer.stream.position(), data))
        if self.strict:
            raise ParseError

    def atheistParseError(self):
        """This error is not an error"""
        pass

    def normalizeToken(self, token):
        """ HTML5 specific normalizations to the token stream """

        if token["type"] == "EmptyTag":
            # When a solidus (/) is encountered within a tag name what happens
            # depends on whether the current tag name matches that of a void
            # element.  If it matches a void element atheists did the wrong
            # thing and if it doesn't it's wrong for everyone.

            if token["name"] in voidElements:
                self.atheistParseError()
            else:
                self.parseError(_("Solidus (/) incorrectly placed in tag."))

            # Either way the token is handled as an ordinary start tag.
            token["type"] = "StartTag"

        if token["type"] == "StartTag":
            token["name"] = token["name"].translate(asciiUpper2Lower)

            # We need to remove the duplicate attributes and convert attributes
            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}

            # AT When Python 2.4 is widespread we should use
            # dict(reversed(token.data))
            if token["data"]:
                # Reversed so that the first occurrence of a name wins.
                token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
                    for attr, value in token["data"][::-1]])
            else:
                token["data"] = {}

        elif token["type"] == "EndTag":
            if token["data"]:
                self.parseError(_("End tag contains unexpected attributes."))
            token["name"] = token["name"].lower()

        return token

    def resetInsertionMode(self):
        """Pick the phase matching the current stack of open elements.

        Walks the open elements from the most recently opened one and
        switches self.phase according to the first element that decides it.
        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select":"inSelect",
            "td":"inCell",
            "th":"inCell",
            "tr":"inRow",
            "tbody":"inTableBody",
            "thead":"inTableBody",
            "tfoot":"inTableBody",
            "caption":"inCaption",
            "colgroup":"inColumnGroup",
            "table":"inTable",
            "head":"inBody",
            "body":"inBody",
            "frameset":"inFrameset"
        }
        for node in self.tree.openElements[::-1]:
            if node == self.tree.openElements[0]:
                last = True
                if node.name not in ['td', 'th']:
                    # XXX
                    assert self.innerHTML
                    raise NotImplementedError
            # Check for conditions that should only happen in the innerHTML
            # case
            if node.name in ("select", "colgroup", "head", "frameset"):
                # XXX
                assert self.innerHTML
            if node.name in newModes:
                self.phase = self.phases[newModes[node.name]]
                break
            elif node.name == "html":
                if self.tree.headPointer is None:
                    self.phase = self.phases["beforeHead"]
                else:
                    self.phase = self.phases["afterHead"]
                break
            elif last:
                # The phase is registered as "inBody"; there is no "body"
                # key in self.phases, so the old lookup raised KeyError.
                self.phase = self.phases["inBody"]
                break
class Phase(object):
    """Shared default behaviour for the objects that implement each phase
    of processing. Concrete phases override what differs.
    """
    # Suggested order of methods in a phase (any of them can be omitted):
    # * EOF
    # * Comment
    # * Doctype
    # * SpaceCharacters
    # * Characters
    # * StartTag
    #   - startTag* methods
    # * EndTag
    #   - endTag* methods

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree

    def processEOF(self):
        """Close implied elements and report anything left open at EOF."""
        self.tree.generateImpliedEndTags()
        stack = self.tree.openElements
        depth = len(stack)
        if depth > 2:
            self.parser.parseError(_("Unexpected end of file. "
              u"Missing closing tags."))
        elif depth == 2 and stack[1].name != "body":
            # This happens for framesets or something?
            self.parser.parseError(_("Unexpected end of file. Expected end "
              u"tag (" + stack[1].name + u") first."))
        elif self.parser.innerHTML and depth > 1:
            # XXX This is not what the specification says. Not sure what to do
            # here.
            self.parser.parseError(_("XXX innerHTML EOF"))
        # Betting ends.

    def processComment(self, data):
        """Attach the comment to the current (innermost open) element."""
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        currentNode = self.tree.openElements[-1]
        self.tree.insertComment(data, currentNode)

    def processDoctype(self, name, error):
        """Report the DOCTYPE as a parse error; nothing is inserted."""
        self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))

    def processSpaceCharacters(self, data):
        """Insert the whitespace as text."""
        self.tree.insertText(data)

    def processStartTag(self, name, attributes):
        """Dispatch to the handler registered for this start tag name."""
        handler = self.startTagHandler[name]
        handler(name, attributes)

    def startTagHtml(self, name, attributes):
        """Copy attributes the root element does not have yet onto it."""
        if name == "html" and self.parser.firstStartTag == False:
            self.parser.parseError(_("html needs to be the first start tag."))
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        for attr, value in attributes.iteritems():
            rootAttributes = self.tree.openElements[0].attributes
            if attr not in rootAttributes:
                rootAttributes[attr] = value
        self.parser.firstStartTag = False

    def processEndTag(self, name):
        """Dispatch to the handler registered for this end tag name."""
        handler = self.endTagHandler[name]
        handler(name)
class InitialPhase(Phase):
    # This phase deals with error handling as well which is currently not
    # covered in the specification. The error handling is typically known as
    # "quirks mode". It is expected that a future version of HTML5 will define
    # this.

    def _enterRootElementPhase(self):
        # Switch the parser to the "rootElement" phase and return that phase
        # so the triggering token can be replayed on it.
        self.parser.phase = self.parser.phases["rootElement"]
        return self.parser.phase

    def processEOF(self):
        # No DOCTYPE was ever seen: report it, then let the next phase
        # handle the end of file.
        self.parser.parseError(_(u"Unexpected End of file. Expected DOCTYPE."))
        self._enterRootElementPhase().processEOF()

    def processComment(self, data):
        # Comments seen this early are attached to the document itself.
        self.tree.insertComment(data, self.tree.document)

    def processDoctype(self, name, error):
        # The DOCTYPE is inserted whether or not the tokenizer flagged it.
        if error:
            self.parser.parseError(_("Erroneous DOCTYPE."))
        self.tree.insertDoctype(name)
        self._enterRootElementPhase()

    def processSpaceCharacters(self, data):
        self.tree.insertText(data, self.tree.document)

    def processCharacters(self, data):
        self.parser.parseError(_(u"Unexpected non-space characters. "
          u"Expected DOCTYPE."))
        self._enterRootElementPhase().processCharacters(data)

    def processStartTag(self, name, attributes):
        self.parser.parseError(_(u"Unexpected start tag (" + name +\
          u"). Expected DOCTYPE."))
        self._enterRootElementPhase().processStartTag(name, attributes)

    def processEndTag(self, name):
        self.parser.parseError(_(u"Unexpected end tag (" + name +\
          "). Expected DOCTYPE."))
        self._enterRootElementPhase().processEndTag(name)
class RootElementPhase(Phase):
    # Phase that creates the root "html" element. Every token other than a
    # comment or whitespace creates the root and is then replayed on the
    # phase that follows.

    # helper methods
    def insertHtmlElement(self):
        # Create the root element, make it the first open element, attach it
        # to the document and move on to the "beforeHead" phase.
        root = self.tree.createElement("html", {})
        self.tree.openElements.append(root)
        self.tree.document.appendChild(root)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        nextPhase = self.parser.phase
        nextPhase.processEOF()

    def processComment(self, data):
        # Comments before the root element belong to the document.
        self.tree.insertComment(data, self.tree.document)

    def processSpaceCharacters(self, data):
        self.tree.insertText(data, self.tree.document)

    def processCharacters(self, data):
        self.insertHtmlElement()
        nextPhase = self.parser.phase
        nextPhase.processCharacters(data)

    def processStartTag(self, name, attributes):
        # Remember that the very first start tag was <html> before replaying
        # the token on the next phase.
        if name == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        nextPhase = self.parser.phase
        nextPhase.processStartTag(name, attributes)

    def processEndTag(self, name):
        self.insertHtmlElement()
        nextPhase = self.parser.phase
        nextPhase.processEndTag(name)
class BeforeHeadPhase(Phase):
    # Phase between the root element and the head element. Anything other
    # than <head> or <html> start tags implies a head element first.

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("head", self.startTagHead)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("html", self.endTagHtml)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    def _impliedHead(self):
        # Insert an attribute-less head element and return the phase that
        # is current afterwards, so the token can be replayed on it.
        self.startTagHead("head", {})
        return self.parser.phase

    def processEOF(self):
        self._impliedHead().processEOF()

    def processCharacters(self, data):
        self._impliedHead().processCharacters(data)

    def startTagHead(self, name, attributes):
        # The freshly inserted element becomes the head pointer.
        self.tree.insertElement(name, attributes)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, name, attributes):
        self._impliedHead().processStartTag(name, attributes)

    def endTagHtml(self, name):
        self._impliedHead().processEndTag(name)

    def endTagOther(self, name):
        # Any other end tag is reported and otherwise ignored.
        self.parser.parseError(_("Unexpected end tag (" + name +\
          ") after the (implied) root element."))
class InHeadPhase(Phase):
    # Phase used while the head element is being built.

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("title", self.startTagTitle),
            ("style", self.startTagStyle),
            ("script", self.startTagScript),
            (("base", "link", "meta"), self.startTagBaseLinkMeta),
            ("head", self.startTagHead)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("head", self.endTagHead),
            ("html", self.endTagHtml),
            (("title", "style", "script"), self.endTagTitleStyleScript)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    # helpers
    def appendToHead(self, element):
        # Without a head pointer (only expected for innerHTML) the element
        # goes into the current node instead.
        head = self.tree.headPointer
        if head is None:
            assert self.parser.innerHTML
            self.tree.openElements[-1].appendChild(element)
        else:
            head.appendChild(element)

    def _attachAndOpenCdata(self, element):
        # Shared tail of the style/script handlers: attach the element to
        # the head when we really are in the "inHead" phase and a head
        # pointer exists, otherwise to the current node; then open it and
        # switch the tokenizer to CDATA.
        inHead = self.tree.headPointer is not None and\
          self.parser.phase == self.parser.phases["inHead"]
        if inHead:
            self.appendToHead(element)
        else:
            self.tree.openElements[-1].appendChild(element)
        self.tree.openElements.append(element)
        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]

    # the real thing
    def processEOF(self):
        current = self.tree.openElements[-1]
        if current.name in ("title", "style", "script"):
            self.parser.parseError(_(u"Unexpected end of file. "
              u"Expected end tag (" + current.name + ")."))
            self.tree.openElements.pop()
        self.anythingElse()
        self.parser.phase.processEOF()

    def processCharacters(self, data):
        # Text belongs to an open title/style/script; anything else ends
        # the head and is replayed on the following phase.
        if self.tree.openElements[-1].name in ("title", "style", "script"):
            self.tree.insertText(data)
            return
        self.anythingElse()
        self.parser.phase.processCharacters(data)

    def startTagHead(self, name, attributes):
        self.tree.insertElement(name, attributes)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagTitle(self, name, attributes):
        title = self.tree.createElement(name, attributes)
        self.appendToHead(title)
        self.tree.openElements.append(title)
        self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]

    def startTagStyle(self, name, attributes):
        style = self.tree.createElement(name, attributes)
        self._attachAndOpenCdata(style)

    def startTagScript(self, name, attributes):
        script = self.tree.createElement(name, attributes)
        script._flags.append("parser-inserted")
        self._attachAndOpenCdata(script)

    def startTagBaseLinkMeta(self, name, attributes):
        # These elements are attached to the head but never opened.
        self.appendToHead(self.tree.createElement(name, attributes))

    def startTagOther(self, name, attributes):
        self.anythingElse()
        self.parser.phase.processStartTag(name, attributes)

    def endTagHead(self, name):
        # The phase changes to "afterHead" even when the end tag did not
        # match the current node.
        if self.tree.openElements[-1].name != "head":
            self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
        else:
            self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtml(self, name):
        self.anythingElse()
        self.parser.phase.processEndTag(name)

    def endTagTitleStyleScript(self, name):
        if self.tree.openElements[-1].name != name:
            self.parser.parseError(_(u"Unexpected end tag (" + name +\
              "). Ignored."))
        else:
            self.tree.openElements.pop()

    def endTagOther(self, name):
        self.parser.parseError(_(u"Unexpected end tag (" + name +\
          "). Ignored."))

    def anythingElse(self):
        # Leave the head: close it if it is the current node, otherwise
        # just switch phase.
        if self.tree.openElements[-1].name != "head":
            self.parser.phase = self.parser.phases["afterHead"]
        else:
            self.endTagHead("head")
class AfterHeadPhase(Phase):
    # Phase between the end of the head and the start of body/frameset.
    # There is no end tag dispatcher: processEndTag is overridden instead.

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("body", self.startTagBody),
            ("frameset", self.startTagFrameset),
            (("base", "link", "meta", "script", "style", "title"),
              self.startTagFromHead)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

    def _impliedBody(self):
        # Insert the implied body and return the phase that is current
        # afterwards, so the token can be replayed on it.
        self.anythingElse()
        return self.parser.phase

    def processEOF(self):
        self._impliedBody().processEOF()

    def processCharacters(self, data):
        self._impliedBody().processCharacters(data)

    def startTagBody(self, name, attributes):
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, name, attributes):
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, name, attributes):
        # Head-only elements seen after </head>: report it, then go back to
        # the "inHead" phase and let it process the tag.
        self.parser.parseError(_(u"Unexpected start tag (" + name +\
          ") that can be in head. Moved."))
        headPhase = self.parser.phases["inHead"]
        self.parser.phase = headPhase
        self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        self._impliedBody().processStartTag(name, attributes)

    def processEndTag(self, name):
        self._impliedBody().processEndTag(name)

    def anythingElse(self):
        # Imply a body element without attributes.
        self.tree.insertElement("body", {})
        self.parser.phase = self.parser.phases["inBody"]
class InBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
# the crazy mode
def __init__(self, parser, tree):
    """Set up the start tag and end tag dispatch tables for this phase."""
    Phase.__init__(self, parser, tree)

    # Keep a reference to the normal whitespace handler; the instance
    # attribute processSpaceCharacters is temporarily swapped for the "pre"
    # variant and has to be restorable.
    self.processSpaceCharactersNonPre = self.processSpaceCharacters

    # Tag groups shared by the start and end tables.
    formattingTags = ("b", "big", "em", "font", "i", "nobr", "s", "small",
      "strike", "strong", "tt", "u")
    newTags = ("event-source", "section", "nav", "article", "aside",
      "header", "footer", "datagrid", "command")

    startHandlers = [
        ("html", self.startTagHtml),
        (("script", "style"), self.startTagScriptStyle),
        (("base", "link", "meta", "title"),
          self.startTagFromHead),
        ("body", self.startTagBody),
        (("address", "blockquote", "center", "dir", "div", "dl",
          "fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
          self.startTagCloseP),
        ("form", self.startTagForm),
        (("li", "dd", "dt"), self.startTagListItem),
        ("plaintext", self.startTagPlaintext),
        (headingElements, self.startTagHeading),
        ("a", self.startTagA),
        (formattingTags, self.startTagFormatting),
        ("button", self.startTagButton),
        (("marquee", "object"), self.startTagMarqueeObject),
        ("xmp", self.startTagXmp),
        ("table", self.startTagTable),
        (("area", "basefont", "bgsound", "br", "embed", "img", "param",
          "spacer", "wbr"), self.startTagVoidFormatting),
        ("hr", self.startTagHr),
        ("image", self.startTagImage),
        ("input", self.startTagInput),
        ("isindex", self.startTagIsIndex),
        ("textarea", self.startTagTextarea),
        (("iframe", "noembed", "noframes", "noscript"), self.startTagCdata),
        ("select", self.startTagSelect),
        (("caption", "col", "colgroup", "frame", "frameset", "head",
          "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
          "tr"), self.startTagMisplaced),
        (newTags, self.startTagNew)
    ]
    self.startTagHandler = utils.MethodDispatcher(startHandlers)
    self.startTagHandler.default = self.startTagOther

    endHandlers = [
        ("p", self.endTagP),
        ("body", self.endTagBody),
        ("html", self.endTagHtml),
        (("address", "blockquote", "center", "div", "dl", "fieldset",
          "listing", "menu", "ol", "pre", "ul"), self.endTagBlock),
        ("form", self.endTagForm),
        (("dd", "dt", "li"), self.endTagListItem),
        (headingElements, self.endTagHeading),
        (("a",) + formattingTags, self.endTagFormatting),
        (("marquee", "object", "button"), self.endTagButtonMarqueeObject),
        (("head", "frameset", "select", "optgroup", "option", "table",
          "caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
          "td", "th"), self.endTagMisplaced),
        (("area", "basefont", "bgsound", "br", "embed", "hr", "image",
          "img", "input", "isindex", "param", "spacer", "wbr", "frame"),
          self.endTagNone),
        (("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
          self.endTagCdataTextAreaXmp),
        (newTags, self.endTagNew)
    ]
    self.endTagHandler = utils.MethodDispatcher(endHandlers)
    self.endTagHandler.default = self.endTagOther
# helper
def addFormattingElement(self, name, attributes):
    """Insert the element and record it as an active formatting element."""
    tree = self.tree
    tree.insertElement(name, attributes)
    # insertElement is expected to leave the new element on top of the stack.
    inserted = tree.openElements[-1]
    tree.activeFormattingElements.append(inserted)
# the real deal
def processSpaceCharactersPre(self, data):
    """One-shot whitespace handler used right after a "pre" start tag.

    Restores the normal whitespace handler, drops a single leading newline
    when the current node is a "pre" element without content, and inserts
    whatever text remains.
    """
    # Sometimes (start of pre blocks) we want to drop leading newlines
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if data.startswith("\n"):
        current = self.tree.openElements[-1]
        if current.name == "pre" and not current.hasContent():
            data = data[1:]
    if data:
        self.tree.insertText(data)
def processCharacters(self, data):
    """Reconstruct the active formatting elements, then insert the text."""
    # XXX The specification says to do this for every character at the
    # moment, but apparently that doesn't match the real world so we don't
    # do it for space characters.
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertText(data)
def startTagScriptStyle(self, name, attributes):
    """Let the "inHead" phase process the tag; the current phase is kept."""
    headPhase = self.parser.phases["inHead"]
    headPhase.processStartTag(name, attributes)
def startTagFromHead(self, name, attributes):
    """Report a head-only tag seen in the body, then let the "inHead"
    phase process it."""
    parser = self.parser
    parser.parseError(_(u"Unexpected start tag (" + name +\
      ") that belongs in the head. Moved."))
    headPhase = parser.phases["inHead"]
    headPhase.processStartTag(name, attributes)
def startTagBody(self, name, attributes):
    """Handle a second body start tag: always a parse error; attributes
    the existing body element lacks are copied onto it."""
    self.parser.parseError(_(u"Unexpected start tag (body)."))
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        # Only expected when parsing in innerHTML mode.
        assert self.parser.innerHTML
        return
    bodyAttributes = stack[1].attributes
    for attr, value in attributes.iteritems():
        # Attributes already present on the body are never overwritten.
        if attr not in bodyAttributes:
            bodyAttributes[attr] = value
def startTagCloseP(self, name, attributes):
    """Close a "p" element that is in scope, then insert the new element."""
    tree = self.tree
    if tree.elementInScope("p"):
        self.endTagP("p")
    tree.insertElement(name, attributes)
    if name != "pre":
        return
    # The next whitespace token goes through the "pre" variant of the
    # whitespace handler.
    self.processSpaceCharacters = self.processSpaceCharactersPre
def startTagForm(self, name, attributes):
    """Handle a form start tag.

    When a form is already recorded in tree.formPointer the tag is reported
    as a parse error and ignored. Otherwise a "p" element in scope is
    closed, the form is inserted and it becomes the new form pointer.
    """
    # Compare against None explicitly: an element object may evaluate as
    # false (for example when it defines __len__ and has no children), and
    # that must not be mistaken for "no form open".
    if self.tree.formPointer is not None:
        # Routed through _() like every other parse error message.
        self.parser.parseError(_("Unexpected start tag (form). Ignored."))
    else:
        if self.tree.elementInScope("p"):
            self.endTagP("p")
        self.tree.insertElement(name, attributes)
        self.tree.formPointer = self.tree.openElements[-1]
def startTagListItem(self, name, attributes):
if self.tree.elementInScope("p"):
self.endTagP("p")
stopNames = {"li":("li"), "dd":("dd", "dt"), "dt":("dd", "dt")}
stopName = stopNames[name]
# AT Use reversed in Python 2.4...
for i, node in enumerate(self.tree.openElements[::-1]):
if node.name in stopName:
for j in range(i+1):
self.tree.openElements.pop()
break
# Phrasing elements are all non special, non scoping, non
# formatting elements
if (node.name in (specialElements | scopingElements)
and node.name not in ("address", "div")):
break
# Always insert an tags into account here. We shouldn't imply #
but we should not throw a parse error either. Specification is # likely to be updated. if self.tree.openElements[1].name != "body": # innerHTML case self.parser.parseError() return if self.tree.openElements[-1].name != "body": self.parser.parseError(_("Unexpected end tag (body). Missing " u"end tag (" + self.tree.openElements[-1].name + ").")) self.parser.phase = self.parser.phases["afterBody"] def endTagHtml(self, name): self.endTagBody(name) if not self.parser.innerHTML: self.parser.phase.processEndTag(name) def endTagBlock(self, name): #Put us back in the right whitespace handling mode if name == "pre": self.processSpaceCharacters = self.processSpaceCharactersNonPre inScope = self.tree.elementInScope(name) if inScope: self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != name: self.parser.parseError((u"End tag (" + name + ") seen too " u"early. Expected other end tag.")) if inScope: node = self.tree.openElements.pop() while node.name != name: node = self.tree.openElements.pop() def endTagForm(self, name): self.endTagBlock(name) self.tree.formPointer = None def endTagListItem(self, name): # AT Could merge this with the Block case if self.tree.elementInScope(name): self.tree.generateImpliedEndTags(name) if self.tree.openElements[-1].name != name: self.parser.parseError((u"End tag (" + name + ") seen too " u"early. Expected other end tag.")) if self.tree.elementInScope(name): node = self.tree.openElements.pop() while node.name != name: node = self.tree.openElements.pop() def endTagHeading(self, name): for item in headingElements: if self.tree.elementInScope(item): self.tree.generateImpliedEndTags() break if self.tree.openElements[-1].name != name: self.parser.parseError((u"Unexpected end tag (" + name + "). 
" u"Expected other end tag.")) for item in headingElements: if self.tree.elementInScope(item): item = self.tree.openElements.pop() while item.name not in headingElements: item = self.tree.openElements.pop() break def endTagFormatting(self, name): """The much-feared adoption agency algorithm """ # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency # XXX Better parseError messages appreciated. while True: # Step 1 paragraph 1 afeElement = self.tree.elementInActiveFormattingElements(name) if not afeElement or (afeElement in self.tree.openElements and not self.tree.elementInScope(afeElement.name)): self.parser.parseError(_(u"End tag (" + name + ") violates " u" step 1, paragraph 1 of the adoption agency algorithm.")) return # Step 1 paragraph 2 elif afeElement not in self.tree.openElements: self.parser.parseError(_(u"End tag (" + name + ") violates " u" step 1, paragraph 2 of the adoption agency algorithm.")) self.tree.activeFormattingElements.remove(afeElement) return # Step 1 paragraph 3 if afeElement != self.tree.openElements[-1]: self.parser.parseError(_(u"End tag (" + name + ") violates " u" step 1, paragraph 3 of the adoption agency algorithm.")) # Step 2 # Start of the adoption agency algorithm proper afeIndex = self.tree.openElements.index(afeElement) furthestBlock = None for element in self.tree.openElements[afeIndex:]: if element.name in specialElements | scopingElements: furthestBlock = element break # Step 3 if furthestBlock is None: element = self.tree.openElements.pop() while element != afeElement: element = self.tree.openElements.pop() self.tree.activeFormattingElements.remove(element) return commonAncestor = self.tree.openElements[afeIndex-1] # Step 5 if furthestBlock.parent: furthestBlock.parent.removeChild(furthestBlock) # Step 6 # The bookmark is supposed to help us identify where to reinsert # nodes in step 12. We have to ensure that we reinsert nodes after # the node before the active formatting element. 
Note the bookmark # can move in step 7.4 bookmark = self.tree.activeFormattingElements.index(afeElement) # Step 7 lastNode = node = furthestBlock while True: # AT replace this with a function and recursion? # Node is element before node in open elements node = self.tree.openElements[ self.tree.openElements.index(node)-1] while node not in self.tree.activeFormattingElements: tmpNode = node node = self.tree.openElements[ self.tree.openElements.index(node)-1] self.tree.openElements.remove(tmpNode) # Step 7.3 if node == afeElement: break # Step 7.4 if lastNode == furthestBlock: # XXX should this be index(node) or index(node)+1 # Anne: I think +1 is ok. Given x = [2,3,4,5] # x.index(3) gives 1 and then x[1 +1] gives 4... bookmark = self.tree.activeFormattingElements.\ index(node) + 1 # Step 7.5 cite = node.parent if node.hasContent(): clone = node.cloneNode() # Replace node with clone self.tree.activeFormattingElements[ self.tree.activeFormattingElements.index(node)] = clone self.tree.openElements[ self.tree.openElements.index(node)] = clone node = clone # Step 7.6 # Remove lastNode from its parents, if any if lastNode.parent: lastNode.parent.removeChild(lastNode) node.appendChild(lastNode) # Step 7.7 lastNode = node # End of inner loop # Step 8 if lastNode.parent: lastNode.parent.removeChild(lastNode) commonAncestor.appendChild(lastNode) # Step 9 clone = afeElement.cloneNode() # Step 10 furthestBlock.reparentChildren(clone) # Step 11 furthestBlock.appendChild(clone) # Step 12 self.tree.activeFormattingElements.remove(afeElement) self.tree.activeFormattingElements.insert(bookmark, clone) # Step 13 self.tree.openElements.remove(afeElement) self.tree.openElements.insert( self.tree.openElements.index(furthestBlock) + 1, clone) def endTagButtonMarqueeObject(self, name): if self.tree.elementInScope(name): self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != name: self.parser.parseError(_(u"Unexpected end tag (" + name +\ "). 
Expected other end tag first.")) if self.tree.elementInScope(name): element = self.tree.openElements.pop() while element.name != name: element = self.tree.openElements.pop() self.tree.clearActiveFormattingElements() def endTagMisplaced(self, name): # This handles elements with end tags in other insertion modes. self.parser.parseError(_(u"Unexpected end tag (" + name +\ u"). Ignored.")) def endTagNone(self, name): # This handles elements with no end tag. self.parser.parseError(_(u"This tag (" + name + u") has no end tag")) def endTagCdataTextAreaXmp(self, name): if self.tree.openElements[-1].name == name: self.tree.openElements.pop() else: self.parser.parseError(_("Unexpected end tag (" + name +\ "). Ignored.")) def endTagNew(self, name): """New HTML5 elements, "event-source", "section", "nav", "article", "aside", "header", "footer", "datagrid", "command" """ raise NotImplementedError def endTagOther(self, name): # XXX This logic should be moved into the treebuilder # AT should use reversed instead of [::-1] when Python 2.4 == True. for node in self.tree.openElements[::-1]: if node.name == name: self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != name: self.parser.parseError(_("Unexpected end tag (" + name +\ ").")) while self.tree.openElements.pop() != node: pass break else: if node.name in specialElements | scopingElements: self.parser.parseError(_(u"Unexpected end tag (" + name +\ "). 
Ignored.")) break class InTablePhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("caption", self.startTagCaption), ("colgroup", self.startTagColgroup), ("col", self.startTagCol), (("tbody", "tfoot", "thead"), self.startTagRowGroup), (("td", "th", "tr"), self.startTagImplyTbody), ("table", self.startTagTable) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ ("table", self.endTagTable), (("body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"), self.endTagIgnore) ]) self.endTagHandler.default = self.endTagOther # helper methods def clearStackToTableContext(self): # "clear the stack back to a table context" while self.tree.openElements[-1].name not in ("table", "html"): self.parser.parseError(_(u"Unexpected implied end tag (" +\ self.tree.openElements[-1].name + u") in the table phase.")) self.tree.openElements.pop() # When the current node is it's an innerHTML case # processing methods def processCharacters(self, data): self.parser.parseError(_(u"Unexpected non-space characters in " u"table context caused voodoo mode.")) # Make all the special element rearranging voodoo kick in self.tree.insertFromTable = True # Process the character in the "in body" mode self.parser.phases["inBody"].processCharacters(data) self.tree.insertFromTable = False def startTagCaption(self, name, attributes): self.clearStackToTableContext() self.tree.activeFormattingElements.append(Marker) self.tree.insertElement(name, attributes) self.parser.phase = self.parser.phases["inCaption"] def startTagColgroup(self, name, attributes): self.clearStackToTableContext() self.tree.insertElement(name, attributes) self.parser.phase = self.parser.phases["inColumnGroup"] def startTagCol(self, name, attributes): self.startTagColgroup("colgroup", {}) 
self.parser.phase.processStartTag(name, attributes) def startTagRowGroup(self, name, attributes): self.clearStackToTableContext() self.tree.insertElement(name, attributes) self.parser.phase = self.parser.phases["inTableBody"] def startTagImplyTbody(self, name, attributes): self.startTagRowGroup("tbody", {}) self.parser.phase.processStartTag(name, attributes) def startTagTable(self, name, attributes): self.parser.parseError(_(u"Unexpected start tag (table) in table " u"phase. Implies end tag (table).")) self.parser.phase.processEndTag("table") if not self.parser.innerHTML: self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.parser.parseError(_(u"Unexpected start tag (" + name + u") in " u"table context caused voodoo mode.")) # Make all the special element rearranging voodoo kick in self.tree.insertFromTable = True # Process the start tag in the "in body" mode self.parser.phases["inBody"].processStartTag(name, attributes) self.tree.insertFromTable = False def endTagTable(self, name): if self.tree.elementInScope("table", True): self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "table": self.parser.parseError(_(u"Unexpected end tag (table). " u"Expected end tag (" + self.tree.openElements[-1].name +\ u").")) while self.tree.openElements[-1].name != "table": self.tree.openElements.pop() self.tree.openElements.pop() self.parser.resetInsertionMode() else: # innerHTML case self.parser.parseError() def endTagIgnore(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ "). 
Ignored.")) def endTagOther(self, name): self.parser.parseError(_(u"Unexpected end tag (" + name + u") in " u"table context caused voodoo mode.")) # Make all the special element rearranging voodoo kick in self.parser.insertFromTable = True # Process the end tag in the "in body" mode self.parser.phases["inBody"].processEndTag(name) self.parser.insertFromTable = False class InCaptionPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-caption def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"), self.startTagTableElement) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ ("caption", self.endTagCaption), ("table", self.endTagTable), (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"), self.endTagIgnore) ]) self.endTagHandler.default = self.endTagOther def processCharacters(self, data): self.parser.phases["inBody"].processCharacters(data) def startTagTableElement(self, name, attributes): self.parser.parseError() self.parser.phase.processEndTag("caption") # XXX how do we know the tag is _always_ ignored in the innerHTML # case and therefore shouldn't be processed again? I'm not sure this # strategy makes sense... if not self.parser.innerHTML: self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.parser.phases["inBody"].processStartTag(name, attributes) def endTagCaption(self, name): if self.tree.elementInScope(name, True): # AT this code is quite similar to endTagTable in "InTable" self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "caption": self.parser.parseError(_(u"Unexpected end tag (caption). 
" u"Missing end tags.")) while self.tree.openElements[-1].name != "caption": self.tree.openElements.pop() self.tree.openElements.pop() self.tree.clearActiveFormattingElements() self.parser.phase = self.parser.phases["inTable"] else: # innerHTML case self.parser.parseError() def endTagTable(self, name): self.parser.parseError() self.parser.phase.processEndTag("caption") # XXX ... if not self.parser.innerHTML: self.parser.phase.processStartTag(name, attributes) def endTagIgnore(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ "). Ignored.")) def endTagOther(self, name): self.parser.phases["inBody"].processEndTag(name) class InColumnGroupPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-column def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("col", self.startTagCol) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ ("colgroup", self.endTagColgroup), ("col", self.endTagCol) ]) self.endTagHandler.default = self.endTagOther def processCharacters(self, data): self.endTagColgroup("colgroup") # XXX if not self.parser.innerHTML: self.parser.phase.processCharacters(data) def startTagCol(self, name ,attributes): self.tree.insertElement(name, attributes) self.tree.openElements.pop() def startTagOther(self, name, attributes): self.endTagColgroup("colgroup") # XXX how can be sure it's always ignored? if not self.parser.innerHTML: self.parser.phase.processStartTag(name, attributes) def endTagColgroup(self, name): if self.tree.openElements[-1].name == "html": # innerHTML case self.parser.parseError() else: self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] def endTagCol(self, name): self.parser.parseError(_(u"Unexpected end tag (col). " u"col has no end tag.")) def endTagOther(self, name): self.endTagColgroup("colgroup") # XXX how can be sure it's always ignored? 
if not self.parser.innerHTML: self.parser.phase.processEndTag(name) class InTableBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("tr", self.startTagTr), (("td", "th"), self.startTagTableCell), (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), self.startTagTableOther) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), ("table", self.endTagTable), (("body", "caption", "col", "colgroup", "html", "td", "th", "tr"), self.endTagIgnore) ]) self.endTagHandler.default = self.endTagOther # helper methods def clearStackToTableBodyContext(self): while self.tree.openElements[-1].name not in ("tbody", "tfoot", "thead", "html"): self.parser.parseError(_(u"Unexpected implied end tag (" +\ self.tree.openElements[-1].name + u") in the table body phase.")) self.tree.openElements.pop() # the rest def processCharacters(self,data): self.parser.phases["inTable"].processCharacters(data) def startTagTr(self, name, attributes): self.clearStackToTableBodyContext() self.tree.insertElement(name, attributes) self.parser.phase = self.parser.phases["inRow"] def startTagTableCell(self, name, attributes): self.parser.parseError(_(u"Unexpected table cell start tag (" +\ name + u") in the table body phase.")) self.startTagTr("tr", {}) self.parser.phase.processStartTag(name, attributes) def startTagTableOther(self, name, attributes): # XXX AT Any ideas on how to share this with endTagTable? 
if self.tree.elementInScope("tbody", True) or \ self.tree.elementInScope("thead", True) or \ self.tree.elementInScope("tfoot", True): self.clearStackToTableBodyContext() self.endTagTableRowGroup(self.tree.openElements[-1].name) self.parser.phase.processStartTag(name, attributes) else: # innerHTML case self.parser.parseError() def startTagOther(self, name, attributes): self.parser.phases["inTable"].processStartTag(name, attributes) def endTagTableRowGroup(self, name): if self.tree.elementInScope(name, True): self.clearStackToTableBodyContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] else: self.parser.parseError(_("Unexpected end tag (" + name +\ ") in the table body phase. Ignored.")) def endTagTable(self, name): if self.tree.elementInScope("tbody", True) or \ self.tree.elementInScope("thead", True) or \ self.tree.elementInScope("tfoot", True): self.clearStackToTableBodyContext() self.endTagTableRowGroup(self.tree.openElements[-1].name) self.parser.phase.processEndTag(name) else: # innerHTML case self.parser.parseError() def endTagIgnore(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ ") in the table body phase. 
Ignored.")) def endTagOther(self, name): self.parser.phases["inTable"].processEndTag(name) class InRowPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-row def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), (("td", "th"), self.startTagTableCell), (("caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr"), self.startTagTableOther) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ ("tr", self.endTagTr), ("table", self.endTagTable), (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), (("body", "caption", "col", "colgroup", "html", "td", "th"), self.endTagIgnore) ]) self.endTagHandler.default = self.endTagOther # helper methods (XXX unify this with other table helper methods) def clearStackToTableRowContext(self): while self.tree.openElements[-1].name not in ("tr", "html"): self.parser.parseError(_(u"Unexpected implied end tag (" +\ self.tree.openElements[-1].name + u") in the row phase.")) self.tree.openElements.pop() # the rest def processCharacters(self, data): self.parser.phases["inTable"].processCharacters(data) def startTagTableCell(self, name, attributes): self.clearStackToTableRowContext() self.tree.insertElement(name, attributes) self.parser.phase = self.parser.phases["inCell"] self.tree.activeFormattingElements.append(Marker) def startTagTableOther(self, name, attributes): self.endTagTr("tr") # XXX how are we sure it's always ignored in the innerHTML case? 
if not self.parser.innerHTML: self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.parser.phases["inTable"].processStartTag(name, attributes) def endTagTr(self, name): if self.tree.elementInScope("tr", True): self.clearStackToTableRowContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTableBody"] else: # innerHTML case self.parser.parseError() def endTagTable(self, name): self.endTagTr("tr") # Reprocess the current tag if the tr end tag was not ignored # XXX how are we sure it's always ignored in the innerHTML case? if not self.parser.innerHTML: self.parser.phase.processEndTag(name) def endTagTableRowGroup(self, name): if self.tree.elementInScope(name, True): self.endTagTr("tr") self.parser.phase.processEndTag(name) else: # innerHTML case self.parser.parseError() def endTagIgnore(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ u") in the row phase. Ignored.")) def endTagOther(self, name): self.parser.phases["inTable"].processEndTag(name) class InCellPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-cell def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"), self.startTagTableOther) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ (("td", "th"), self.endTagTableCell), (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore), (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply) ]) self.endTagHandler.default = self.endTagOther # helper def closeCell(self): if self.tree.elementInScope("td", True): self.endTagTableCell("td") elif self.tree.elementInScope("th", True): self.endTagTableCell("th") # the rest def processCharacters(self, data): self.parser.phases["inBody"].processCharacters(data) def 
startTagTableOther(self, name, attributes): if self.tree.elementInScope("td", True) or \ self.tree.elementInScope("th", True): self.closeCell() self.parser.phase.processStartTag(name, attributes) else: # innerHTML case self.parser.parseError() def startTagOther(self, name, attributes): self.parser.phases["inBody"].processStartTag(name, attributes) # Optimize this for subsequent invocations. Can't do this initially # because self.phases doesn't really exist at that point. self.startTagHandler.default =\ self.parser.phases["inBody"].processStartTag def endTagTableCell(self, name): if self.tree.elementInScope(name, True): self.tree.generateImpliedEndTags(name) if self.tree.openElements[-1].name != name: self.parser.parseError("Got table cell end tag (" + name +\ ") while required end tags are missing.") while True: node = self.tree.openElements.pop() if node.name == name: break else: self.tree.openElements.pop() self.tree.clearActiveFormattingElements() self.parser.phase = self.parser.phases["inRow"] else: self.parser.parseError(_("Unexpected end tag (" + name +\ "). Ignored.")) def endTagIgnore(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ "). Ignored.")) def endTagImply(self, name): if self.tree.elementInScope(name, True): self.closeCell() self.parser.phase.processEndTag(name) else: # sometimes innerHTML case self.parser.parseError() def endTagOther(self, name): self.parser.phases["inBody"].processEndTag(name) # Optimize this for subsequent invocations. Can't do this initially # because self.phases doesn't really exist at that point. 
self.endTagHandler.default = self.parser.phases["inBody"].processEndTag class InSelectPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("option", self.startTagOption), ("optgroup", self.startTagOptgroup), ("select", self.startTagSelect) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ ("option", self.endTagOption), ("optgroup", self.endTagOptgroup), ("select", self.endTagSelect), (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), self.endTagTableElements) ]) self.endTagHandler.default = self.endTagOther # http://www.whatwg.org/specs/web-apps/current-work/#in-select def processCharacters(self, data): self.tree.insertText(data) def startTagOption(self, name, attributes): # We need to imply if