#!/usr/bin/env python
"""This is a toy example, not a serious conformance checker.

In particular, it only reports parse errors when reading the document; it
does not report any of the other (many) possible types of conformance
errors that may exist in an HTML5 document.

The script runs as a CGI program: it reads the ``uri`` form field, fetches
that resource, parses it with html5lib and writes a results page to stdout.
"""

import sys
import urllib2
import cgi

import html5lib

# NOTE(review): none of the page literals in this script contain markup even
# though the response is served as text/html -- presumably the tags were lost
# at some point; confirm against the deployed copy before relying on layout.
htmlTemplate = u""" %(title)s

%(title)s

%(body)s """


def _escape(value):
    """Format *value* with ``%s`` and HTML-escape it (quotes included).

    Going through ``%s`` keeps the same stringification the page templates
    applied before, so ``None`` or a list of values is rendered, not rejected.
    """
    return cgi.escape("%s" % (value,), True)


def parseDocument(document):
    """Parse the document and return a list of errors and a parse tree.

    The parse tree is returned as an HTML-escaped string, ready to be
    inserted into a results page.
    """
    p = html5lib.HTMLParser()
    tree = p.parse(document)
    return p.errors, cgi.escape(tree.printTree(), True)


def getDocument(uri):
    """Fetch and return the body of the resource at *uri*.

    Only ``http://`` and ``https://`` URIs are accepted; anything else,
    including a missing or empty value, raises ValueError.  Exceptions
    raised while opening or reading the resource propagate to the caller.
    """
    if not uri or not uri.startswith(("http://", "https://")):
        raise ValueError("Unrecognised URI type")
    response = urllib2.urlopen(uri)
    try:
        # Return the body verbatim.  Dropping the final character (as this
        # function used to) truncates documents that do not end in a newline.
        return response.read()
    finally:
        response.close()


def writeValid(uri, treeStr):
    """Write the results page for a document that parsed without errors.

    *uri* is escaped here; *treeStr* is expected to be escaped already
    (parseDocument returns it that way).
    """
    bodyText = """

%s is valid HTML5!

Parse Tree:

%s

""" % (_escape(uri), treeStr)
    writeOutput(htmlTemplate % {"title": "Validation Results",
                                "body": bodyText})


def writeInvalid(uri, treeStr, errors):
    """Write the results page for a document that produced parse errors.

    *errors* is an iterable of ``(pos, message)`` pairs where ``pos`` is a
    ``(line, column)`` tuple.  *uri* and the error messages are escaped
    here; *treeStr* is expected to be escaped already.
    """
    errList = [_escape(("Line %i Col %i" % pos) + " " + message)
               for pos, message in errors]
    errStr = "\n\n".join(errList)
    # Three placeholders for three values: the error list slot was missing,
    # which made the formatting below raise TypeError.
    bodyText = """

%s is not valid HTML5

Errors:

%s

Parse Tree:

%s

""" % (_escape(uri), errStr, treeStr)
    writeOutput(htmlTemplate % {"title": "Validation Results",
                                "body": bodyText})


def writeErr(uri):
    """Write the error page shown when *uri* could not be loaded."""
    bodyText = "\n\nFailed to load URI %s\n\n" % (_escape(uri),)
    writeOutput(htmlTemplate % {"title": "Error", "body": bodyText})


def writeOutput(s):
    """Write *s* to stdout encoded as UTF-8, followed by a newline."""
    sys.stdout.write(s.encode('utf-8') + "\n")


# CGI response header followed by the blank line that ends the header block.
sys.stdout.write("Content-type: text/html\n")
sys.stdout.write("\n")

# Bound up front so the error page can be written even if reading the form
# itself fails.
uri = None
try:
    form = cgi.FieldStorage()
    uri = form.getvalue("uri")
    document = getDocument(uri)
except Exception:
    # Any failure to obtain the document is reported to the user as a page.
    writeErr(uri)
    sys.exit(1)

errors, tree = parseDocument(document)
if errors:
    writeInvalid(uri, tree, errors)
else:
    writeValid(uri, tree)