# test_htmlparser.py
"""Tests for HTMLParser.py."""
import HTMLParser
import pprint
import unittest
from test import test_support
  6. class EventCollector(HTMLParser.HTMLParser):
  7. def __init__(self):
  8. self.events = []
  9. self.append = self.events.append
  10. HTMLParser.HTMLParser.__init__(self)
  11. def get_events(self):
  12. # Normalize the list of events so that buffer artefacts don't
  13. # separate runs of contiguous characters.
  14. L = []
  15. prevtype = None
  16. for event in self.events:
  17. type = event[0]
  18. if type == prevtype == "data":
  19. L[-1] = ("data", L[-1][1] + event[1])
  20. else:
  21. L.append(event)
  22. prevtype = type
  23. self.events = L
  24. return L
  25. # structure markup
  26. def handle_starttag(self, tag, attrs):
  27. self.append(("starttag", tag, attrs))
  28. def handle_startendtag(self, tag, attrs):
  29. self.append(("startendtag", tag, attrs))
  30. def handle_endtag(self, tag):
  31. self.append(("endtag", tag))
  32. # all other markup
  33. def handle_comment(self, data):
  34. self.append(("comment", data))
  35. def handle_charref(self, data):
  36. self.append(("charref", data))
  37. def handle_data(self, data):
  38. self.append(("data", data))
  39. def handle_decl(self, data):
  40. self.append(("decl", data))
  41. def handle_entityref(self, data):
  42. self.append(("entityref", data))
  43. def handle_pi(self, data):
  44. self.append(("pi", data))
  45. def unknown_decl(self, decl):
  46. self.append(("unknown decl", decl))
  47. class EventCollectorExtra(EventCollector):
  48. def handle_starttag(self, tag, attrs):
  49. EventCollector.handle_starttag(self, tag, attrs)
  50. self.append(("starttag_text", self.get_starttag_text()))
  51. class TestCaseBase(unittest.TestCase):
  52. def _run_check(self, source, expected_events, collector=EventCollector):
  53. parser = collector()
  54. for s in source:
  55. parser.feed(s)
  56. parser.close()
  57. events = parser.get_events()
  58. if events != expected_events:
  59. self.fail("received events did not match expected events\n"
  60. "Expected:\n" + pprint.pformat(expected_events) +
  61. "\nReceived:\n" + pprint.pformat(events))
  62. def _run_check_extra(self, source, events):
  63. self._run_check(source, events, EventCollectorExtra)
  64. def _parse_error(self, source):
  65. def parse(source=source):
  66. parser = HTMLParser.HTMLParser()
  67. parser.feed(source)
  68. parser.close()
  69. self.assertRaises(HTMLParser.HTMLParseError, parse)
  70. class HTMLParserTestCase(TestCaseBase):
  71. def test_processing_instruction_only(self):
  72. self._run_check("<?processing instruction>", [
  73. ("pi", "processing instruction"),
  74. ])
  75. self._run_check("<?processing instruction ?>", [
  76. ("pi", "processing instruction ?"),
  77. ])
  78. def test_simple_html(self):
  79. self._run_check("""
  80. <!DOCTYPE html PUBLIC 'foo'>
  81. <HTML>&entity;&#32;
  82. <!--comment1a
  83. -></foo><bar>&lt;<?pi?></foo<bar
  84. comment1b-->
  85. <Img sRc='Bar' isMAP>sample
  86. text
  87. &#x201C;
  88. <!--comment2a-- --comment2b-->
  89. </Html>
  90. """, [
  91. ("data", "\n"),
  92. ("decl", "DOCTYPE html PUBLIC 'foo'"),
  93. ("data", "\n"),
  94. ("starttag", "html", []),
  95. ("entityref", "entity"),
  96. ("charref", "32"),
  97. ("data", "\n"),
  98. ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
  99. ("data", "\n"),
  100. ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
  101. ("data", "sample\ntext\n"),
  102. ("charref", "x201C"),
  103. ("data", "\n"),
  104. ("comment", "comment2a-- --comment2b"),
  105. ("data", "\n"),
  106. ("endtag", "html"),
  107. ("data", "\n"),
  108. ])
  109. def test_unclosed_entityref(self):
  110. self._run_check("&entityref foo", [
  111. ("entityref", "entityref"),
  112. ("data", " foo"),
  113. ])
  114. def test_bad_nesting(self):
  115. # Strangely, this *is* supposed to test that overlapping
  116. # elements are allowed. HTMLParser is more geared toward
  117. # lexing the input that parsing the structure.
  118. self._run_check("<a><b></a></b>", [
  119. ("starttag", "a", []),
  120. ("starttag", "b", []),
  121. ("endtag", "a"),
  122. ("endtag", "b"),
  123. ])
  124. def test_bare_ampersands(self):
  125. self._run_check("this text & contains & ampersands &", [
  126. ("data", "this text & contains & ampersands &"),
  127. ])
  128. def test_bare_pointy_brackets(self):
  129. self._run_check("this < text > contains < bare>pointy< brackets", [
  130. ("data", "this < text > contains < bare>pointy< brackets"),
  131. ])
  132. def test_illegal_declarations(self):
  133. self._run_check('<!spacer type="block" height="25">',
  134. [('comment', 'spacer type="block" height="25"')])
  135. def test_starttag_end_boundary(self):
  136. self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
  137. self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
  138. def test_buffer_artefacts(self):
  139. output = [("starttag", "a", [("b", "<")])]
  140. self._run_check(["<a b='<'>"], output)
  141. self._run_check(["<a ", "b='<'>"], output)
  142. self._run_check(["<a b", "='<'>"], output)
  143. self._run_check(["<a b=", "'<'>"], output)
  144. self._run_check(["<a b='<", "'>"], output)
  145. self._run_check(["<a b='<'", ">"], output)
  146. output = [("starttag", "a", [("b", ">")])]
  147. self._run_check(["<a b='>'>"], output)
  148. self._run_check(["<a ", "b='>'>"], output)
  149. self._run_check(["<a b", "='>'>"], output)
  150. self._run_check(["<a b=", "'>'>"], output)
  151. self._run_check(["<a b='>", "'>"], output)
  152. self._run_check(["<a b='>'", ">"], output)
  153. output = [("comment", "abc")]
  154. self._run_check(["", "<!--abc-->"], output)
  155. self._run_check(["<", "!--abc-->"], output)
  156. self._run_check(["<!", "--abc-->"], output)
  157. self._run_check(["<!-", "-abc-->"], output)
  158. self._run_check(["<!--", "abc-->"], output)
  159. self._run_check(["<!--a", "bc-->"], output)
  160. self._run_check(["<!--ab", "c-->"], output)
  161. self._run_check(["<!--abc", "-->"], output)
  162. self._run_check(["<!--abc-", "->"], output)
  163. self._run_check(["<!--abc--", ">"], output)
  164. self._run_check(["<!--abc-->", ""], output)
  165. def test_starttag_junk_chars(self):
  166. self._run_check("</>", [])
  167. self._run_check("</$>", [('comment', '$')])
  168. self._run_check("</", [('data', '</')])
  169. self._run_check("</a", [('data', '</a')])
  170. self._run_check("<a<a>", [('starttag', 'a<a', [])])
  171. self._run_check("</a<a>", [('endtag', 'a<a')])
  172. self._run_check("<!", [('data', '<!')])
  173. self._run_check("<a", [('data', '<a')])
  174. self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
  175. self._run_check("<a foo='bar", [('data', "<a foo='bar")])
  176. self._run_check("<a foo='>'", [('data', "<a foo='>'")])
  177. self._run_check("<a foo='>", [('data', "<a foo='>")])
  178. self._run_check("<a$>", [('starttag', 'a$', [])])
  179. self._run_check("<a$b>", [('starttag', 'a$b', [])])
  180. self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
  181. self._run_check("<a$b >", [('starttag', 'a$b', [])])
  182. self._run_check("<a$b />", [('startendtag', 'a$b', [])])
  183. def test_valid_doctypes(self):
  184. # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
  185. dtds = ['HTML', # HTML5 doctype
  186. ('HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
  187. '"http://www.w3.org/TR/html4/strict.dtd"'),
  188. ('HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
  189. '"http://www.w3.org/TR/html4/loose.dtd"'),
  190. ('html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
  191. '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"'),
  192. ('html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" '
  193. '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"'),
  194. ('math PUBLIC "-//W3C//DTD MathML 2.0//EN" '
  195. '"http://www.w3.org/Math/DTD/mathml2/mathml2.dtd"'),
  196. ('html PUBLIC "-//W3C//DTD '
  197. 'XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN" '
  198. '"http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd"'),
  199. ('svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
  200. '"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"'),
  201. 'html PUBLIC "-//IETF//DTD HTML 2.0//EN"',
  202. 'html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"']
  203. for dtd in dtds:
  204. self._run_check("<!DOCTYPE %s>" % dtd,
  205. [('decl', 'DOCTYPE ' + dtd)])
  206. def test_slashes_in_starttag(self):
  207. self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
  208. html = ('<img width=902 height=250px '
  209. 'src="/sites/default/files/images/homepage/foo.jpg" '
  210. '/*what am I doing here*/ />')
  211. expected = [(
  212. 'startendtag', 'img',
  213. [('width', '902'), ('height', '250px'),
  214. ('src', '/sites/default/files/images/homepage/foo.jpg'),
  215. ('*what', None), ('am', None), ('i', None),
  216. ('doing', None), ('here*', None)]
  217. )]
  218. self._run_check(html, expected)
  219. html = ('<a / /foo/ / /=/ / /bar/ / />'
  220. '<a / /foo/ / /=/ / /bar/ / >')
  221. expected = [
  222. ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
  223. ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
  224. ]
  225. self._run_check(html, expected)
  226. #see issue #14538
  227. html = ('<meta><meta / ><meta // ><meta / / >'
  228. '<meta/><meta /><meta //><meta//>')
  229. expected = [
  230. ('starttag', 'meta', []), ('starttag', 'meta', []),
  231. ('starttag', 'meta', []), ('starttag', 'meta', []),
  232. ('startendtag', 'meta', []), ('startendtag', 'meta', []),
  233. ('startendtag', 'meta', []), ('startendtag', 'meta', []),
  234. ]
  235. self._run_check(html, expected)
  236. def test_declaration_junk_chars(self):
  237. self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
  238. def test_startendtag(self):
  239. self._run_check("<p/>", [
  240. ("startendtag", "p", []),
  241. ])
  242. self._run_check("<p></p>", [
  243. ("starttag", "p", []),
  244. ("endtag", "p"),
  245. ])
  246. self._run_check("<p><img src='foo' /></p>", [
  247. ("starttag", "p", []),
  248. ("startendtag", "img", [("src", "foo")]),
  249. ("endtag", "p"),
  250. ])
  251. def test_invalid_end_tags(self):
  252. # A collection of broken end tags. <br> is used as separator.
  253. # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
  254. # and #13993
  255. html = ('<br></label</p><br></div end tmAd-leaderBoard><br></<h4><br>'
  256. '</li class="unit"><br></li\r\n\t\t\t\t\t\t</ul><br></><br>')
  257. expected = [('starttag', 'br', []),
  258. # < is part of the name, / is discarded, p is an attribute
  259. ('endtag', 'label<'),
  260. ('starttag', 'br', []),
  261. # text and attributes are discarded
  262. ('endtag', 'div'),
  263. ('starttag', 'br', []),
  264. # comment because the first char after </ is not a-zA-Z
  265. ('comment', '<h4'),
  266. ('starttag', 'br', []),
  267. # attributes are discarded
  268. ('endtag', 'li'),
  269. ('starttag', 'br', []),
  270. # everything till ul (included) is discarded
  271. ('endtag', 'li'),
  272. ('starttag', 'br', []),
  273. # </> is ignored
  274. ('starttag', 'br', [])]
  275. self._run_check(html, expected)
  276. def test_broken_invalid_end_tag(self):
  277. # This is technically wrong (the "> shouldn't be included in the 'data')
  278. # but is probably not worth fixing it (in addition to all the cases of
  279. # the previous test, it would require a full attribute parsing).
  280. # see #13993
  281. html = '<b>This</b attr=">"> confuses the parser'
  282. expected = [('starttag', 'b', []),
  283. ('data', 'This'),
  284. ('endtag', 'b'),
  285. ('data', '"> confuses the parser')]
  286. self._run_check(html, expected)
  287. def test_get_starttag_text(self):
  288. s = """<foo:bar \n one="1"\ttwo=2 >"""
  289. self._run_check_extra(s, [
  290. ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
  291. ("starttag_text", s)])
  292. def test_cdata_content(self):
  293. contents = [
  294. '<!-- not a comment --> &not-an-entity-ref;',
  295. "<not a='start tag'>",
  296. '<a href="" /> <p> <span></span>',
  297. 'foo = "</scr" + "ipt>";',
  298. 'foo = "</SCRIPT" + ">";',
  299. 'foo = <\n/script> ',
  300. '<!-- document.write("</scr" + "ipt>"); -->',
  301. ('\n//<![CDATA[\n'
  302. 'document.write(\'<s\'+\'cript type="text/javascript" '
  303. 'src="http://www.example.org/r=\'+new '
  304. 'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
  305. '\n<!-- //\nvar foo = 3.14;\n// -->\n',
  306. 'foo = "</sty" + "le>";',
  307. u'<!-- \u2603 -->',
  308. # these two should be invalid according to the HTML 5 spec,
  309. # section 8.1.2.2
  310. #'foo = </\nscript>',
  311. #'foo = </ script>',
  312. ]
  313. elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
  314. for content in contents:
  315. for element in elements:
  316. element_lower = element.lower()
  317. s = u'<{element}>{content}</{element}>'.format(element=element,
  318. content=content)
  319. self._run_check(s, [("starttag", element_lower, []),
  320. ("data", content),
  321. ("endtag", element_lower)])
  322. def test_cdata_with_closing_tags(self):
  323. # see issue #13358
  324. # make sure that HTMLParser calls handle_data only once for each CDATA.
  325. # The normal event collector normalizes the events in get_events,
  326. # so we override it to return the original list of events.
  327. class Collector(EventCollector):
  328. def get_events(self):
  329. return self.events
  330. content = """<!-- not a comment --> &not-an-entity-ref;
  331. <a href="" /> </p><p> &amp; <span></span></style>
  332. '</script' + '>' </html> </head> </scripter>!"""
  333. for element in [' script', 'script ', ' script ',
  334. '\nscript', 'script\n', '\nscript\n']:
  335. s = u'<script>{content}</{element}>'.format(element=element,
  336. content=content)
  337. self._run_check(s, [("starttag", "script", []),
  338. ("data", content),
  339. ("endtag", "script")],
  340. collector=Collector)
  341. def test_malformatted_charref(self):
  342. self._run_check("<p>&#bad;</p>", [
  343. ("starttag", "p", []),
  344. ("data", "&#bad;"),
  345. ("endtag", "p"),
  346. ])
  347. # add the [] as a workaround to avoid buffering (see #20288)
  348. self._run_check(["<div>&#bad;</div>"], [
  349. ("starttag", "div", []),
  350. ("data", "&#bad;"),
  351. ("endtag", "div"),
  352. ])
  353. def test_unescape_function(self):
  354. parser = HTMLParser.HTMLParser()
  355. self.assertEqual(parser.unescape('&#bad;'),'&#bad;')
  356. self.assertEqual(parser.unescape('&#0038;'),'&')
  357. class AttributesTestCase(TestCaseBase):
  358. def test_attr_syntax(self):
  359. output = [
  360. ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
  361. ]
  362. self._run_check("""<a b='v' c="v" d=v e>""", output)
  363. self._run_check("""<a b = 'v' c = "v" d = v e>""", output)
  364. self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
  365. self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
  366. def test_attr_values(self):
  367. self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
  368. [("starttag", "a", [("b", "xxx\n\txxx"),
  369. ("c", "yyy\t\nyyy"),
  370. ("d", "\txyz\n")])])
  371. self._run_check("""<a b='' c="">""",
  372. [("starttag", "a", [("b", ""), ("c", "")])])
  373. # Regression test for SF patch #669683.
  374. self._run_check("<e a=rgb(1,2,3)>",
  375. [("starttag", "e", [("a", "rgb(1,2,3)")])])
  376. # Regression test for SF bug #921657.
  377. self._run_check(
  378. "<a href=mailto:xyz@example.com>",
  379. [("starttag", "a", [("href", "mailto:xyz@example.com")])])
  380. def test_attr_nonascii(self):
  381. # see issue 7311
  382. self._run_check(
  383. u"<img src=/foo/bar.png alt=\u4e2d\u6587>",
  384. [("starttag", "img", [("src", "/foo/bar.png"),
  385. ("alt", u"\u4e2d\u6587")])])
  386. self._run_check(
  387. u"<a title='\u30c6\u30b9\u30c8' href='\u30c6\u30b9\u30c8.html'>",
  388. [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
  389. ("href", u"\u30c6\u30b9\u30c8.html")])])
  390. self._run_check(
  391. u'<a title="\u30c6\u30b9\u30c8" href="\u30c6\u30b9\u30c8.html">',
  392. [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
  393. ("href", u"\u30c6\u30b9\u30c8.html")])])
  394. def test_attr_entity_replacement(self):
  395. self._run_check(
  396. "<a b='&amp;&gt;&lt;&quot;&apos;'>",
  397. [("starttag", "a", [("b", "&><\"'")])])
  398. def test_attr_funky_names(self):
  399. self._run_check(
  400. "<a a.b='v' c:d=v e-f=v>",
  401. [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
  402. self._run_check(
  403. "<a $><b $=%><c \=/>",
  404. [("starttag", "a", [("$", None)]),
  405. ("starttag", "b", [("$", "%")]),
  406. ("starttag", "c", [("\\", "/")])])
  407. def test_entityrefs_in_attributes(self):
  408. self._run_check(
  409. "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
  410. [("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])])
  411. def test_entities_in_attribute_value(self):
  412. # see #1200313
  413. for entity in ['&', '&amp;', '&#38;', '&#x26;']:
  414. self._run_check('<a href="%s">' % entity,
  415. [("starttag", "a", [("href", "&")])])
  416. self._run_check("<a href='%s'>" % entity,
  417. [("starttag", "a", [("href", "&")])])
  418. self._run_check("<a href=%s>" % entity,
  419. [("starttag", "a", [("href", "&")])])
  420. def test_malformed_attributes(self):
  421. # see #13357
  422. html = (
  423. "<a href=test'style='color:red;bad1'>test - bad1</a>"
  424. "<a href=test'+style='color:red;ba2'>test - bad2</a>"
  425. "<a href=test'&nbsp;style='color:red;bad3'>test - bad3</a>"
  426. "<a href = test'&nbsp;style='color:red;bad4' >test - bad4</a>"
  427. )
  428. expected = [
  429. ('starttag', 'a', [('href', "test'style='color:red;bad1'")]),
  430. ('data', 'test - bad1'), ('endtag', 'a'),
  431. ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]),
  432. ('data', 'test - bad2'), ('endtag', 'a'),
  433. ('starttag', 'a', [('href', u"test'\xa0style='color:red;bad3'")]),
  434. ('data', 'test - bad3'), ('endtag', 'a'),
  435. ('starttag', 'a', [('href', u"test'\xa0style='color:red;bad4'")]),
  436. ('data', 'test - bad4'), ('endtag', 'a')
  437. ]
  438. self._run_check(html, expected)
  439. def test_malformed_adjacent_attributes(self):
  440. # see #12629
  441. self._run_check('<x><y z=""o"" /></x>',
  442. [('starttag', 'x', []),
  443. ('startendtag', 'y', [('z', ''), ('o""', None)]),
  444. ('endtag', 'x')])
  445. self._run_check('<x><y z="""" /></x>',
  446. [('starttag', 'x', []),
  447. ('startendtag', 'y', [('z', ''), ('""', None)]),
  448. ('endtag', 'x')])
  449. # see #755670 for the following 3 tests
  450. def test_adjacent_attributes(self):
  451. self._run_check('<a width="100%"cellspacing=0>',
  452. [("starttag", "a",
  453. [("width", "100%"), ("cellspacing","0")])])
  454. self._run_check('<a id="foo"class="bar">',
  455. [("starttag", "a",
  456. [("id", "foo"), ("class","bar")])])
  457. def test_missing_attribute_value(self):
  458. self._run_check('<a v=>',
  459. [("starttag", "a", [("v", "")])])
  460. def test_javascript_attribute_value(self):
  461. self._run_check("<a href=javascript:popup('/popup/help.html')>",
  462. [("starttag", "a",
  463. [("href", "javascript:popup('/popup/help.html')")])])
  464. def test_end_tag_in_attribute_value(self):
  465. # see #1745761
  466. self._run_check("<a href='http://www.example.org/\">;'>spam</a>",
  467. [("starttag", "a",
  468. [("href", "http://www.example.org/\">;")]),
  469. ("data", "spam"), ("endtag", "a")])
  470. def test_comments(self):
  471. html = ("<!-- I'm a valid comment -->"
  472. '<!--me too!-->'
  473. '<!------>'
  474. '<!---->'
  475. '<!----I have many hyphens---->'
  476. '<!-- I have a > in the middle -->'
  477. '<!-- and I have -- in the middle! -->')
  478. expected = [('comment', " I'm a valid comment "),
  479. ('comment', 'me too!'),
  480. ('comment', '--'),
  481. ('comment', ''),
  482. ('comment', '--I have many hyphens--'),
  483. ('comment', ' I have a > in the middle '),
  484. ('comment', ' and I have -- in the middle! ')]
  485. self._run_check(html, expected)
  486. def test_broken_comments(self):
  487. html = ('<! not really a comment >'
  488. '<! not a comment either -->'
  489. '<! -- close enough -->'
  490. '<!><!<-- this was an empty comment>'
  491. '<!!! another bogus comment !!!>')
  492. expected = [
  493. ('comment', ' not really a comment '),
  494. ('comment', ' not a comment either --'),
  495. ('comment', ' -- close enough --'),
  496. ('comment', ''),
  497. ('comment', '<-- this was an empty comment'),
  498. ('comment', '!! another bogus comment !!!'),
  499. ]
  500. self._run_check(html, expected)
  501. def test_condcoms(self):
  502. html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
  503. '<!--[if IE 8]>condcoms<![endif]-->'
  504. '<!--[if lte IE 7]>pretty?<![endif]-->')
  505. expected = [('comment', "[if IE & !(lte IE 8)]>aren't<![endif]"),
  506. ('comment', '[if IE 8]>condcoms<![endif]'),
  507. ('comment', '[if lte IE 7]>pretty?<![endif]')]
  508. self._run_check(html, expected)
  509. def test_broken_condcoms(self):
  510. # these condcoms are missing the '--' after '<!' and before the '>'
  511. html = ('<![if !(IE)]>broken condcom<![endif]>'
  512. '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
  513. '<![if !IE 6]><img src="firefox.png" /><![endif]>'
  514. '<![if !ie 6]><b>foo</b><![endif]>'
  515. '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
  516. # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
  517. # and "8.2.4.45 Markup declaration open state", comment tokens should
  518. # be emitted instead of 'unknown decl', but calling unknown_decl
  519. # provides more flexibility.
  520. # See also Lib/_markupbase.py:parse_declaration
  521. expected = [
  522. ('unknown decl', 'if !(IE)'),
  523. ('data', 'broken condcom'),
  524. ('unknown decl', 'endif'),
  525. ('unknown decl', 'if ! IE'),
  526. ('startendtag', 'link', [('href', 'favicon.tiff')]),
  527. ('unknown decl', 'endif'),
  528. ('unknown decl', 'if !IE 6'),
  529. ('startendtag', 'img', [('src', 'firefox.png')]),
  530. ('unknown decl', 'endif'),
  531. ('unknown decl', 'if !ie 6'),
  532. ('starttag', 'b', []),
  533. ('data', 'foo'),
  534. ('endtag', 'b'),
  535. ('unknown decl', 'endif'),
  536. ('unknown decl', 'if (!IE)|(lt IE 9)'),
  537. ('startendtag', 'img', [('src', 'mammoth.bmp')]),
  538. ('unknown decl', 'endif')
  539. ]
  540. self._run_check(html, expected)
  541. def test_main():
  542. test_support.run_unittest(HTMLParserTestCase, AttributesTestCase)
  543. if __name__ == "__main__":
  544. test_main()