import string try: frozenset except NameError: # Import from the sets module for python 2.3 from sets import Set as set from sets import ImmutableSet as frozenset EOF = None contentModelFlags = { "PCDATA":0, "RCDATA":1, "CDATA":2, "PLAINTEXT":3 } scopingElements = frozenset(( "button", "caption", "html", "marquee", "object", "table", "td", "th" )) formattingElements = frozenset(( "a", "b", "big", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" )) specialElements = frozenset(( "address", "area", "base", "basefont", "bgsound", "blockquote", "body", "br", "center", "col", "colgroup", "dd", "dir", "div", "dl", "dt", "embed", "fieldset", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "iframe", "image", "img", "input", "isindex", "li", "link", "listing", "menu", "meta", "noembed", "noframes", "noscript", "ol", "optgroup", "option", "p", "param", "plaintext", "pre", "script", "select", "spacer", "style", "tbody", "textarea", "tfoot", "thead", "title", "tr", "ul", "wbr" )) spaceCharacters = frozenset(( u"\t", u"\n", u"\u000B", u"\u000C", u" ", u"\r" )) tableInsertModeElements = frozenset(( "table", "tbody", "tfoot", "thead", "tr" )) asciiLowercase = frozenset(string.ascii_lowercase) asciiUppercase = frozenset(string.ascii_uppercase) asciiLetters = frozenset(string.ascii_letters) digits = frozenset(string.digits) hexDigits = frozenset(string.hexdigits) asciiUpper2Lower = dict([(ord(c),ord(c.lower())) for c in string.ascii_uppercase]) # Heading elements need to be ordered headingElements = ( "h1", "h2", "h3", "h4", "h5", "h6" ) # XXX What about event-source and command? voidElements = frozenset(( "base", "link", "meta", "hr", "br", "img", "embed", "param", "area", "col", "input" )) # entitiesWindows1252 has to be _ordered_ and needs to have an index. It # therefore can't be a frozenset. 
# Entry i is the code point used for byte 0x80 + i (the byte is given in the
# comment on each entry); slots marked UNDEFINED hold U+FFFD.
entitiesWindows1252 = (
    0x20AC,  # 0x80  EURO SIGN
    0xFFFD,  # 0x81  UNDEFINED
    0x201A,  # 0x82  SINGLE LOW-9 QUOTATION MARK
    0x0192,  # 0x83  LATIN SMALL LETTER F WITH HOOK
    0x201E,  # 0x84  DOUBLE LOW-9 QUOTATION MARK
    0x2026,  # 0x85  HORIZONTAL ELLIPSIS
    0x2020,  # 0x86  DAGGER
    0x2021,  # 0x87  DOUBLE DAGGER
    0x02C6,  # 0x88  MODIFIER LETTER CIRCUMFLEX ACCENT
    0x2030,  # 0x89  PER MILLE SIGN
    0x0160,  # 0x8A  LATIN CAPITAL LETTER S WITH CARON
    0x2039,  # 0x8B  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x0152,  # 0x8C  LATIN CAPITAL LIGATURE OE
    0xFFFD,  # 0x8D  UNDEFINED
    0x017D,  # 0x8E  LATIN CAPITAL LETTER Z WITH CARON
    0xFFFD,  # 0x8F  UNDEFINED
    0xFFFD,  # 0x90  UNDEFINED
    0x2018,  # 0x91  LEFT SINGLE QUOTATION MARK
    0x2019,  # 0x92  RIGHT SINGLE QUOTATION MARK
    0x201C,  # 0x93  LEFT DOUBLE QUOTATION MARK
    0x201D,  # 0x94  RIGHT DOUBLE QUOTATION MARK
    0x2022,  # 0x95  BULLET
    0x2013,  # 0x96  EN DASH
    0x2014,  # 0x97  EM DASH
    0x02DC,  # 0x98  SMALL TILDE
    0x2122,  # 0x99  TRADE MARK SIGN
    0x0161,  # 0x9A  LATIN SMALL LETTER S WITH CARON
    0x203A,  # 0x9B  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x0153,  # 0x9C  LATIN SMALL LIGATURE OE
    0xFFFD,  # 0x9D  UNDEFINED
    0x017E,  # 0x9E  LATIN SMALL LETTER Z WITH CARON
    0x0178   # 0x9F  LATIN CAPITAL LETTER Y WITH DIAERESIS
)

# Named character references: entity name (no leading "&" or trailing ";")
# mapped to the single unicode character it stands for.
entities = {
    "AElig": u"\u00C6",
    "Aacute": u"\u00C1",
    "Acirc": u"\u00C2",
    "Agrave": u"\u00C0",
    "Alpha": u"\u0391",
    "Aring": u"\u00C5",
    "Atilde": u"\u00C3",
    "Auml": u"\u00C4",
    "Beta": u"\u0392",
    "Ccedil": u"\u00C7",
    "Chi": u"\u03A7",
    "Dagger": u"\u2021",
    "Delta": u"\u0394",
    "ETH": u"\u00D0",
    "Eacute": u"\u00C9",
    "Ecirc": u"\u00CA",
    "Egrave": u"\u00C8",
    "Epsilon": u"\u0395",
    "Eta": u"\u0397",
    "Euml": u"\u00CB",
    "Gamma": u"\u0393",
    "Iacute": u"\u00CD",
    "Icirc": u"\u00CE",
    "Igrave": u"\u00CC",
    "Iota": u"\u0399",
    "Iuml": u"\u00CF",
    "Kappa": u"\u039A",
    "Lambda": u"\u039B",
    "Mu": u"\u039C",
    "Ntilde": u"\u00D1",
    "Nu": u"\u039D",
    "OElig": u"\u0152",
    "Oacute": u"\u00D3",
    "Ocirc": u"\u00D4",
    "Ograve": u"\u00D2",
    "Omega": u"\u03A9",
    "Omicron": u"\u039F",
    "Oslash": u"\u00D8",
    "Otilde": u"\u00D5",
    "Ouml": u"\u00D6",
    "Phi": u"\u03A6",
    "Pi": u"\u03A0",
    "Prime": u"\u2033",
    "Psi": u"\u03A8",
    "Rho": u"\u03A1",
    "Scaron": u"\u0160",
    "Sigma": u"\u03A3",
    "THORN": u"\u00DE",
    "Tau": u"\u03A4",
    "Theta": u"\u0398",
    "Uacute": u"\u00DA",
    "Ucirc": u"\u00DB",
    "Ugrave": u"\u00D9",
    "Upsilon": u"\u03A5",
    "Uuml": u"\u00DC",
    "Xi": u"\u039E",
    "Yacute": u"\u00DD",
    "Yuml": u"\u0178",
    "Zeta": u"\u0396",
    "aacute": u"\u00E1",
    "acirc": u"\u00E2",
    "acute": u"\u00B4",
    "aelig": u"\u00E6",
    "agrave": u"\u00E0",
    "alefsym": u"\u2135",
    "alpha": u"\u03B1",
    "amp": u"\u0026",
    "AMP": u"\u0026",
    "and": u"\u2227",
    "ang": u"\u2220",
    "apos": u"\u0027",
    "aring": u"\u00E5",
    "asymp": u"\u2248",
    "atilde": u"\u00E3",
    "auml": u"\u00E4",
    "bdquo": u"\u201E",
    "beta": u"\u03B2",
    "brvbar": u"\u00A6",
    "bull": u"\u2022",
    "cap": u"\u2229",
    "ccedil": u"\u00E7",
    "cedil": u"\u00B8",
    "cent": u"\u00A2",
    "chi": u"\u03C7",
    "circ": u"\u02C6",
    "clubs": u"\u2663",
    "cong": u"\u2245",
    "copy": u"\u00A9",
    "COPY": u"\u00A9",
    "crarr": u"\u21B5",
    "cup": u"\u222A",
    "curren": u"\u00A4",
    "dArr": u"\u21D3",
    "dagger": u"\u2020",
    "darr": u"\u2193",
    "deg": u"\u00B0",
    "delta": u"\u03B4",
    "diams": u"\u2666",
    "divide": u"\u00F7",
    "eacute": u"\u00E9",
    "ecirc": u"\u00EA",
    "egrave": u"\u00E8",
    "empty": u"\u2205",
    "emsp": u"\u2003",
    "ensp": u"\u2002",
    "epsilon": u"\u03B5",
    "equiv": u"\u2261",
    "eta": u"\u03B7",
    "eth": u"\u00F0",
    "euml": u"\u00EB",
    "euro": u"\u20AC",
    "exist": u"\u2203",
    "fnof": u"\u0192",
    "forall": u"\u2200",
    "frac12": u"\u00BD",
    "frac14": u"\u00BC",
    "frac34": u"\u00BE",
    "frasl": u"\u2044",
    "gamma": u"\u03B3",
    "ge": u"\u2265",
    "gt": u"\u003E",
    "GT": u"\u003E",
    "hArr": u"\u21D4",
    "harr": u"\u2194",
    "hearts": u"\u2665",
    "hellip": u"\u2026",
    "iacute": u"\u00ED",
    "icirc": u"\u00EE",
    "iexcl": u"\u00A1",
    "igrave": u"\u00EC",
    "image": u"\u2111",
    "infin": u"\u221E",
    "int": u"\u222B",
    "iota": u"\u03B9",
    "iquest": u"\u00BF",
    "isin": u"\u2208",
    "iuml": u"\u00EF",
    "kappa": u"\u03BA",
    "lArr": u"\u21D0",
    "lambda": u"\u03BB",
    "lang": u"\u2329",
    "laquo": u"\u00AB",
    "larr": u"\u2190",
    "lceil": u"\u2308",
    "ldquo": u"\u201C",
    "le": u"\u2264",
    "lfloor": u"\u230A",
    "lowast": u"\u2217",
    "loz": u"\u25CA",
    "lrm": u"\u200E",
    "lsaquo": u"\u2039",
    "lsquo": u"\u2018",
    "lt": u"\u003C",
    "LT": u"\u003C",
    "macr": u"\u00AF",
    "mdash": u"\u2014",
    "micro": u"\u00B5",
    "middot": u"\u00B7",
    "minus": u"\u2212",
    "mu": u"\u03BC",
    "nabla": u"\u2207",
    "nbsp": u"\u00A0",
    "ndash": u"\u2013",
    "ne": u"\u2260",
    "ni": u"\u220B",
    "not": u"\u00AC",
    "notin": u"\u2209",
    "nsub": u"\u2284",
    "ntilde": u"\u00F1",
    "nu": u"\u03BD",
    "oacute": u"\u00F3",
    "ocirc": u"\u00F4",
    "oelig": u"\u0153",
    "ograve": u"\u00F2",
    "oline": u"\u203E",
    "omega": u"\u03C9",
    "omicron": u"\u03BF",
    "oplus": u"\u2295",
    "or": u"\u2228",
    "ordf": u"\u00AA",
    "ordm": u"\u00BA",
    "oslash": u"\u00F8",
    "otilde": u"\u00F5",
    "otimes": u"\u2297",
    "ouml": u"\u00F6",
    "para": u"\u00B6",
    "part": u"\u2202",
    "permil": u"\u2030",
    "perp": u"\u22A5",
    "phi": u"\u03C6",
    "pi": u"\u03C0",
    "piv": u"\u03D6",
    "plusmn": u"\u00B1",
    "pound": u"\u00A3",
    "prime": u"\u2032",
    "prod": u"\u220F",
    "prop": u"\u221D",
    "psi": u"\u03C8",
    "quot": u"\u0022",
    "QUOT": u"\u0022",
    "rArr": u"\u21D2",
    "radic": u"\u221A",
    "rang": u"\u232A",
    "raquo": u"\u00BB",
    "rarr": u"\u2192",
    "rceil": u"\u2309",
    "rdquo": u"\u201D",
    "real": u"\u211C",
    "reg": u"\u00AE",
    "REG": u"\u00AE",
    "rfloor": u"\u230B",
    "rho": u"\u03C1",
    "rlm": u"\u200F",
    "rsaquo": u"\u203A",
    "rsquo": u"\u2019",
    "sbquo": u"\u201A",
    "scaron": u"\u0161",
    "sdot": u"\u22C5",
    "sect": u"\u00A7",
    "shy": u"\u00AD",
    "sigma": u"\u03C3",
    "sigmaf": u"\u03C2",
    "sim": u"\u223C",
    "spades": u"\u2660",
    "sub": u"\u2282",
    "sube": u"\u2286",
    "sum": u"\u2211",
    "sup": u"\u2283",
    "sup1": u"\u00B9",
    "sup2": u"\u00B2",
    "sup3": u"\u00B3",
    "supe": u"\u2287",
    "szlig": u"\u00DF",
    "tau": u"\u03C4",
    "there4": u"\u2234",
    "theta": u"\u03B8",
    "thetasym": u"\u03D1",
    "thinsp": u"\u2009",
    "thorn": u"\u00FE",
    "tilde": u"\u02DC",
    "times": u"\u00D7",
    "trade": u"\u2122",
    "uArr": u"\u21D1",
    "uacute": u"\u00FA",
    "uarr": u"\u2191",
    "ucirc": u"\u00FB",
    "ugrave": u"\u00F9",
    "uml": u"\u00A8",
    "upsih": u"\u03D2",
    "upsilon": u"\u03C5",
    "uuml": u"\u00FC",
    "weierp": u"\u2118",
    "xi": u"\u03BE",
    "yacute": u"\u00FD",
    "yen": u"\u00A5",
    "yuml": u"\u00FF",
    "zeta": u"\u03B6",
    "zwj": u"\u200D",
    "zwnj": u"\u200C"
}

# Lowercase encoding labels. Related labels are kept together, in the order
# the table originally listed them.
encodings = frozenset((
    "ansi_x3.4-1968", "iso-ir-6", "ansi_x3.4-1986", "iso_646.irv:1991",
    "ascii", "iso646-us", "us-ascii", "us", "ibm367", "cp367", "csascii",
    "ks_c_5601-1987", "korean",
    "iso-2022-kr", "csiso2022kr",
    "euc-kr",
    "iso-2022-jp", "csiso2022jp",
    "iso-2022-jp-2",
    "iso-ir-58", "chinese", "csiso58gb231280",
    "iso_8859-1:1987", "iso-ir-100", "iso_8859-1", "iso-8859-1",
    "latin1", "l1", "ibm819", "cp819", "csisolatin1",
    "iso_8859-2:1987", "iso-ir-101", "iso_8859-2", "iso-8859-2",
    "latin2", "l2", "csisolatin2",
    "iso_8859-3:1988", "iso-ir-109", "iso_8859-3", "iso-8859-3",
    "latin3", "l3", "csisolatin3",
    "iso_8859-4:1988", "iso-ir-110", "iso_8859-4", "iso-8859-4",
    "latin4", "l4", "csisolatin4",
    "iso_8859-6:1987", "iso-ir-127", "iso_8859-6", "iso-8859-6",
    "ecma-114", "asmo-708", "arabic", "csisolatinarabic",
    "iso_8859-7:1987", "iso-ir-126", "iso_8859-7", "iso-8859-7",
    "elot_928", "ecma-118", "greek", "greek8", "csisolatingreek",
    "iso_8859-8:1988", "iso-ir-138", "iso_8859-8", "iso-8859-8",
    "hebrew", "csisolatinhebrew",
    "iso_8859-5:1988", "iso-ir-144", "iso_8859-5", "iso-8859-5",
    "cyrillic", "csisolatincyrillic",
    "iso_8859-9:1989", "iso-ir-148", "iso_8859-9", "iso-8859-9",
    "latin5", "l5", "csisolatin5",
    "iso-8859-10", "iso-ir-157", "l6", "iso_8859-10:1992",
    "csisolatin6", "latin6",
    "hp-roman8", "roman8", "r8",
    "ibm037", "cp037", "csibm037",
    "ibm424", "cp424", "csibm424",
    "ibm437", "cp437", "437", "cspc8codepage437",
    "ibm500", "cp500", "csibm500",
    "ibm775", "cp775", "cspc775baltic",
    "ibm850", "cp850", "850", "cspc850multilingual",
    "ibm852", "cp852", "852", "cspcp852",
    "ibm855", "cp855", "855", "csibm855",
    "ibm857", "cp857", "857", "csibm857",
    "ibm860", "cp860", "860", "csibm860",
    "ibm861", "cp861", "861", "cp-is", "csibm861",
    "ibm862", "cp862", "862", "cspc862latinhebrew",
    "ibm863", "cp863", "863", "csibm863",
    "ibm864", "cp864", "csibm864",
    "ibm865", "cp865", "865", "csibm865",
    "ibm866", "cp866", "866", "csibm866",
    "ibm869", "cp869", "869", "cp-gr", "csibm869",
    "ibm1026", "cp1026", "csibm1026",
    "koi8-r", "cskoi8r",
    "koi8-u",
    "big5-hkscs",
    "ptcp154", "csptcp154", "pt154", "cp154",
    "utf-7", "utf-16be", "utf-16le", "utf-16", "utf-8",
    "iso-8859-13",
    "iso-8859-14", "iso-ir-199", "iso_8859-14:1998", "iso_8859-14",
    "latin8", "iso-celtic", "l8",
    "iso-8859-15", "iso_8859-15",
    "iso-8859-16", "iso-ir-226", "iso_8859-16:2001", "iso_8859-16",
    "latin10", "l10",
    "gbk", "cp936", "ms936",
    "gb18030",
    "shift_jis", "ms_kanji", "csshiftjis",
    "euc-jp",
    "gb2312",
    "big5", "csbig5",
    "windows-1250", "windows-1251", "windows-1252", "windows-1253",
    "windows-1254", "windows-1255", "windows-1256", "windows-1257",
    "windows-1258",
    "tis-620",
    "hz-gb-2312",
))