# parser.py
  1. """A parser for HTML and XHTML."""
  2. # This file is based on sgmllib.py, but the API is slightly different.
  3. # XXX There should be a way to distinguish between PCDATA (parsed
  4. # character data -- the normal case), RCDATA (replaceable character
  5. # data -- only char and entity references and end tags are special)
  6. # and CDATA (character data -- only end tags are special).
  7. import re
  8. import warnings
  9. import _markupbase
  10. from html import unescape
  11. __all__ = ['HTMLParser']
  12. # Regular expressions used for parsing
  13. interesting_normal = re.compile('[&<]')
  14. incomplete = re.compile('&[a-zA-Z#]')
  15. entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  16. charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  17. starttagopen = re.compile('<[a-zA-Z]')
  18. piclose = re.compile('>')
  19. commentclose = re.compile(r'--\s*>')
  20. # Note:
  21. # 1) if you change tagfind/attrfind remember to update locatestarttagend too;
  22. # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
  23. # explode, so don't do it.
  24. # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
  25. # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
  26. tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
  27. attrfind_tolerant = re.compile(
  28. r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
  29. r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
  30. locatestarttagend_tolerant = re.compile(r"""
  31. <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
  32. (?:[\s/]* # optional whitespace before attribute name
  33. (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
  34. (?:\s*=+\s* # value indicator
  35. (?:'[^']*' # LITA-enclosed value
  36. |"[^"]*" # LIT-enclosed value
  37. |(?!['"])[^>\s]* # bare value
  38. )
  39. (?:\s*,)* # possibly followed by a comma
  40. )?(?:\s|/(?!>))*
  41. )*
  42. )?
  43. \s* # trailing whitespace
  44. """, re.VERBOSE)
  45. endendtag = re.compile('>')
  46. # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
  47. # </ and the tag name, so maybe this should be fixed
  48. endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  49. class HTMLParser(_markupbase.ParserBase):
  50. """Find tags and other markup and call handler functions.
  51. Usage:
  52. p = HTMLParser()
  53. p.feed(data)
  54. ...
  55. p.close()
  56. Start tags are handled by calling self.handle_starttag() or
  57. self.handle_startendtag(); end tags by self.handle_endtag(). The
  58. data between tags is passed from the parser to the derived class
  59. by calling self.handle_data() with the data as argument (the data
  60. may be split up in arbitrary chunks). If convert_charrefs is
  61. True the character references are converted automatically to the
  62. corresponding Unicode character (and self.handle_data() is no
  63. longer split in chunks), otherwise they are passed by calling
  64. self.handle_entityref() or self.handle_charref() with the string
  65. containing respectively the named or numeric reference as the
  66. argument.
  67. """
  68. CDATA_CONTENT_ELEMENTS = ("script", "style")
  69. def __init__(self, *, convert_charrefs=True):
  70. """Initialize and reset this instance.
  71. If convert_charrefs is True (the default), all character references
  72. are automatically converted to the corresponding Unicode characters.
  73. """
  74. self.convert_charrefs = convert_charrefs
  75. self.reset()
  76. def reset(self):
  77. """Reset this instance. Loses all unprocessed data."""
  78. self.rawdata = ''
  79. self.lasttag = '???'
  80. self.interesting = interesting_normal
  81. self.cdata_elem = None
  82. _markupbase.ParserBase.reset(self)
  83. def feed(self, data):
  84. r"""Feed data to the parser.
  85. Call this as often as you want, with as little or as much text
  86. as you want (may include '\n').
  87. """
  88. self.rawdata = self.rawdata + data
  89. self.goahead(0)
  90. def close(self):
  91. """Handle any buffered data."""
  92. self.goahead(1)
  93. __starttag_text = None
  94. def get_starttag_text(self):
  95. """Return full source of start tag: '<...>'."""
  96. return self.__starttag_text
  97. def set_cdata_mode(self, elem):
  98. self.cdata_elem = elem.lower()
  99. self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
  100. def clear_cdata_mode(self):
  101. self.interesting = interesting_normal
  102. self.cdata_elem = None
  103. # Internal -- handle data as far as reasonable. May leave state
  104. # and data to be processed by a subsequent call. If 'end' is
  105. # true, force handling all data as if followed by EOF marker.
  106. def goahead(self, end):
  107. rawdata = self.rawdata
  108. i = 0
  109. n = len(rawdata)
  110. while i < n:
  111. if self.convert_charrefs and not self.cdata_elem:
  112. j = rawdata.find('<', i)
  113. if j < 0:
  114. # if we can't find the next <, either we are at the end
  115. # or there's more text incoming. If the latter is True,
  116. # we can't pass the text to handle_data in case we have
  117. # a charref cut in half at end. Try to determine if
  118. # this is the case before proceeding by looking for an
  119. # & near the end and see if it's followed by a space or ;.
  120. amppos = rawdata.rfind('&', max(i, n-34))
  121. if (amppos >= 0 and
  122. not re.compile(r'[\s;]').search(rawdata, amppos)):
  123. break # wait till we get all the text
  124. j = n
  125. else:
  126. match = self.interesting.search(rawdata, i) # < or &
  127. if match:
  128. j = match.start()
  129. else:
  130. if self.cdata_elem:
  131. break
  132. j = n
  133. if i < j:
  134. if self.convert_charrefs and not self.cdata_elem:
  135. self.handle_data(unescape(rawdata[i:j]))
  136. else:
  137. self.handle_data(rawdata[i:j])
  138. i = self.updatepos(i, j)
  139. if i == n: break
  140. startswith = rawdata.startswith
  141. if startswith('<', i):
  142. if starttagopen.match(rawdata, i): # < + letter
  143. k = self.parse_starttag(i)
  144. elif startswith("</", i):
  145. k = self.parse_endtag(i)
  146. elif startswith("<!--", i):
  147. k = self.parse_comment(i)
  148. elif startswith("<?", i):
  149. k = self.parse_pi(i)
  150. elif startswith("<!", i):
  151. k = self.parse_html_declaration(i)
  152. elif (i + 1) < n:
  153. self.handle_data("<")
  154. k = i + 1
  155. else:
  156. break
  157. if k < 0:
  158. if not end:
  159. break
  160. k = rawdata.find('>', i + 1)
  161. if k < 0:
  162. k = rawdata.find('<', i + 1)
  163. if k < 0:
  164. k = i + 1
  165. else:
  166. k += 1
  167. if self.convert_charrefs and not self.cdata_elem:
  168. self.handle_data(unescape(rawdata[i:k]))
  169. else:
  170. self.handle_data(rawdata[i:k])
  171. i = self.updatepos(i, k)
  172. elif startswith("&#", i):
  173. match = charref.match(rawdata, i)
  174. if match:
  175. name = match.group()[2:-1]
  176. self.handle_charref(name)
  177. k = match.end()
  178. if not startswith(';', k-1):
  179. k = k - 1
  180. i = self.updatepos(i, k)
  181. continue
  182. else:
  183. if ";" in rawdata[i:]: # bail by consuming &#
  184. self.handle_data(rawdata[i:i+2])
  185. i = self.updatepos(i, i+2)
  186. break
  187. elif startswith('&', i):
  188. match = entityref.match(rawdata, i)
  189. if match:
  190. name = match.group(1)
  191. self.handle_entityref(name)
  192. k = match.end()
  193. if not startswith(';', k-1):
  194. k = k - 1
  195. i = self.updatepos(i, k)
  196. continue
  197. match = incomplete.match(rawdata, i)
  198. if match:
  199. # match.group() will contain at least 2 chars
  200. if end and match.group() == rawdata[i:]:
  201. k = match.end()
  202. if k <= i:
  203. k = n
  204. i = self.updatepos(i, i + 1)
  205. # incomplete
  206. break
  207. elif (i + 1) < n:
  208. # not the end of the buffer, and can't be confused
  209. # with some other construct
  210. self.handle_data("&")
  211. i = self.updatepos(i, i + 1)
  212. else:
  213. break
  214. else:
  215. assert 0, "interesting.search() lied"
  216. # end while
  217. if end and i < n and not self.cdata_elem:
  218. if self.convert_charrefs and not self.cdata_elem:
  219. self.handle_data(unescape(rawdata[i:n]))
  220. else:
  221. self.handle_data(rawdata[i:n])
  222. i = self.updatepos(i, n)
  223. self.rawdata = rawdata[i:]
  224. # Internal -- parse html declarations, return length or -1 if not terminated
  225. # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
  226. # See also parse_declaration in _markupbase
  227. def parse_html_declaration(self, i):
  228. rawdata = self.rawdata
  229. assert rawdata[i:i+2] == '<!', ('unexpected call to '
  230. 'parse_html_declaration()')
  231. if rawdata[i:i+4] == '<!--':
  232. # this case is actually already handled in goahead()
  233. return self.parse_comment(i)
  234. elif rawdata[i:i+3] == '<![':
  235. return self.parse_marked_section(i)
  236. elif rawdata[i:i+9].lower() == '<!doctype':
  237. # find the closing >
  238. gtpos = rawdata.find('>', i+9)
  239. if gtpos == -1:
  240. return -1
  241. self.handle_decl(rawdata[i+2:gtpos])
  242. return gtpos+1
  243. else:
  244. return self.parse_bogus_comment(i)
  245. # Internal -- parse bogus comment, return length or -1 if not terminated
  246. # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
  247. def parse_bogus_comment(self, i, report=1):
  248. rawdata = self.rawdata
  249. assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
  250. 'parse_comment()')
  251. pos = rawdata.find('>', i+2)
  252. if pos == -1:
  253. return -1
  254. if report:
  255. self.handle_comment(rawdata[i+2:pos])
  256. return pos + 1
  257. # Internal -- parse processing instr, return end or -1 if not terminated
  258. def parse_pi(self, i):
  259. rawdata = self.rawdata
  260. assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
  261. match = piclose.search(rawdata, i+2) # >
  262. if not match:
  263. return -1
  264. j = match.start()
  265. self.handle_pi(rawdata[i+2: j])
  266. j = match.end()
  267. return j
  268. # Internal -- handle starttag, return end or -1 if not terminated
  269. def parse_starttag(self, i):
  270. self.__starttag_text = None
  271. endpos = self.check_for_whole_start_tag(i)
  272. if endpos < 0:
  273. return endpos
  274. rawdata = self.rawdata
  275. self.__starttag_text = rawdata[i:endpos]
  276. # Now parse the data between i+1 and j into a tag and attrs
  277. attrs = []
  278. match = tagfind_tolerant.match(rawdata, i+1)
  279. assert match, 'unexpected call to parse_starttag()'
  280. k = match.end()
  281. self.lasttag = tag = match.group(1).lower()
  282. while k < endpos:
  283. m = attrfind_tolerant.match(rawdata, k)
  284. if not m:
  285. break
  286. attrname, rest, attrvalue = m.group(1, 2, 3)
  287. if not rest:
  288. attrvalue = None
  289. elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
  290. attrvalue[:1] == '"' == attrvalue[-1:]:
  291. attrvalue = attrvalue[1:-1]
  292. if attrvalue:
  293. attrvalue = unescape(attrvalue)
  294. attrs.append((attrname.lower(), attrvalue))
  295. k = m.end()
  296. end = rawdata[k:endpos].strip()
  297. if end not in (">", "/>"):
  298. lineno, offset = self.getpos()
  299. if "\n" in self.__starttag_text:
  300. lineno = lineno + self.__starttag_text.count("\n")
  301. offset = len(self.__starttag_text) \
  302. - self.__starttag_text.rfind("\n")
  303. else:
  304. offset = offset + len(self.__starttag_text)
  305. self.handle_data(rawdata[i:endpos])
  306. return endpos
  307. if end.endswith('/>'):
  308. # XHTML-style empty tag: <span attr="value" />
  309. self.handle_startendtag(tag, attrs)
  310. else:
  311. self.handle_starttag(tag, attrs)
  312. if tag in self.CDATA_CONTENT_ELEMENTS:
  313. self.set_cdata_mode(tag)
  314. return endpos
  315. # Internal -- check to see if we have a complete starttag; return end
  316. # or -1 if incomplete.
  317. def check_for_whole_start_tag(self, i):
  318. rawdata = self.rawdata
  319. m = locatestarttagend_tolerant.match(rawdata, i)
  320. if m:
  321. j = m.end()
  322. next = rawdata[j:j+1]
  323. if next == ">":
  324. return j + 1
  325. if next == "/":
  326. if rawdata.startswith("/>", j):
  327. return j + 2
  328. if rawdata.startswith("/", j):
  329. # buffer boundary
  330. return -1
  331. # else bogus input
  332. if j > i:
  333. return j
  334. else:
  335. return i + 1
  336. if next == "":
  337. # end of input
  338. return -1
  339. if next in ("abcdefghijklmnopqrstuvwxyz=/"
  340. "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
  341. # end of input in or before attribute value, or we have the
  342. # '/' from a '/>' ending
  343. return -1
  344. if j > i:
  345. return j
  346. else:
  347. return i + 1
  348. raise AssertionError("we should not get here!")
  349. # Internal -- parse endtag, return end or -1 if incomplete
  350. def parse_endtag(self, i):
  351. rawdata = self.rawdata
  352. assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
  353. match = endendtag.search(rawdata, i+1) # >
  354. if not match:
  355. return -1
  356. gtpos = match.end()
  357. match = endtagfind.match(rawdata, i) # </ + tag + >
  358. if not match:
  359. if self.cdata_elem is not None:
  360. self.handle_data(rawdata[i:gtpos])
  361. return gtpos
  362. # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
  363. namematch = tagfind_tolerant.match(rawdata, i+2)
  364. if not namematch:
  365. # w3.org/TR/html5/tokenization.html#end-tag-open-state
  366. if rawdata[i:i+3] == '</>':
  367. return i+3
  368. else:
  369. return self.parse_bogus_comment(i)
  370. tagname = namematch.group(1).lower()
  371. # consume and ignore other stuff between the name and the >
  372. # Note: this is not 100% correct, since we might have things like
  373. # </tag attr=">">, but looking for > after tha name should cover
  374. # most of the cases and is much simpler
  375. gtpos = rawdata.find('>', namematch.end())
  376. self.handle_endtag(tagname)
  377. return gtpos+1
  378. elem = match.group(1).lower() # script or style
  379. if self.cdata_elem is not None:
  380. if elem != self.cdata_elem:
  381. self.handle_data(rawdata[i:gtpos])
  382. return gtpos
  383. self.handle_endtag(elem.lower())
  384. self.clear_cdata_mode()
  385. return gtpos
  386. # Overridable -- finish processing of start+end tag: <tag.../>
  387. def handle_startendtag(self, tag, attrs):
  388. self.handle_starttag(tag, attrs)
  389. self.handle_endtag(tag)
  390. # Overridable -- handle start tag
  391. def handle_starttag(self, tag, attrs):
  392. pass
  393. # Overridable -- handle end tag
  394. def handle_endtag(self, tag):
  395. pass
  396. # Overridable -- handle character reference
  397. def handle_charref(self, name):
  398. pass
  399. # Overridable -- handle entity reference
  400. def handle_entityref(self, name):
  401. pass
  402. # Overridable -- handle data
  403. def handle_data(self, data):
  404. pass
  405. # Overridable -- handle comment
  406. def handle_comment(self, data):
  407. pass
  408. # Overridable -- handle declaration
  409. def handle_decl(self, decl):
  410. pass
  411. # Overridable -- handle processing instruction
  412. def handle_pi(self, data):
  413. pass
  414. def unknown_decl(self, data):
  415. pass
  416. # Internal -- helper to remove special character quoting
  417. def unescape(self, s):
  418. warnings.warn('The unescape method is deprecated and will be removed '
  419. 'in 3.5, use html.unescape() instead.',
  420. DeprecationWarning, stacklevel=2)
  421. return unescape(s)