pulldom.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. import xml.sax
  2. import xml.sax.handler
# Event type tokens. Each event produced by PullDOM is a (token, node)
# tuple whose first item is one of these strings.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
  11. class PullDOM(xml.sax.ContentHandler):
  12. _locator = None
  13. document = None
  14. def __init__(self, documentFactory=None):
  15. from xml.dom import XML_NAMESPACE
  16. self.documentFactory = documentFactory
  17. self.firstEvent = [None, None]
  18. self.lastEvent = self.firstEvent
  19. self.elementStack = []
  20. self.push = self.elementStack.append
  21. try:
  22. self.pop = self.elementStack.pop
  23. except AttributeError:
  24. # use class' pop instead
  25. pass
  26. self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
  27. self._current_context = self._ns_contexts[-1]
  28. self.pending_events = []
  29. def pop(self):
  30. result = self.elementStack[-1]
  31. del self.elementStack[-1]
  32. return result
  33. def setDocumentLocator(self, locator):
  34. self._locator = locator
  35. def startPrefixMapping(self, prefix, uri):
  36. if not hasattr(self, '_xmlns_attrs'):
  37. self._xmlns_attrs = []
  38. self._xmlns_attrs.append((prefix or 'xmlns', uri))
  39. self._ns_contexts.append(self._current_context.copy())
  40. self._current_context[uri] = prefix or None
  41. def endPrefixMapping(self, prefix):
  42. self._current_context = self._ns_contexts.pop()
  43. def startElementNS(self, name, tagName , attrs):
  44. # Retrieve xml namespace declaration attributes.
  45. xmlns_uri = 'http://www.w3.org/2000/xmlns/'
  46. xmlns_attrs = getattr(self, '_xmlns_attrs', None)
  47. if xmlns_attrs is not None:
  48. for aname, value in xmlns_attrs:
  49. attrs._attrs[(xmlns_uri, aname)] = value
  50. self._xmlns_attrs = []
  51. uri, localname = name
  52. if uri:
  53. # When using namespaces, the reader may or may not
  54. # provide us with the original name. If not, create
  55. # *a* valid tagName from the current context.
  56. if tagName is None:
  57. prefix = self._current_context[uri]
  58. if prefix:
  59. tagName = prefix + ":" + localname
  60. else:
  61. tagName = localname
  62. if self.document:
  63. node = self.document.createElementNS(uri, tagName)
  64. else:
  65. node = self.buildDocument(uri, tagName)
  66. else:
  67. # When the tagname is not prefixed, it just appears as
  68. # localname
  69. if self.document:
  70. node = self.document.createElement(localname)
  71. else:
  72. node = self.buildDocument(None, localname)
  73. for aname,value in attrs.items():
  74. a_uri, a_localname = aname
  75. if a_uri == xmlns_uri:
  76. if a_localname == 'xmlns':
  77. qname = a_localname
  78. else:
  79. qname = 'xmlns:' + a_localname
  80. attr = self.document.createAttributeNS(a_uri, qname)
  81. node.setAttributeNodeNS(attr)
  82. elif a_uri:
  83. prefix = self._current_context[a_uri]
  84. if prefix:
  85. qname = prefix + ":" + a_localname
  86. else:
  87. qname = a_localname
  88. attr = self.document.createAttributeNS(a_uri, qname)
  89. node.setAttributeNodeNS(attr)
  90. else:
  91. attr = self.document.createAttribute(a_localname)
  92. node.setAttributeNode(attr)
  93. attr.value = value
  94. self.lastEvent[1] = [(START_ELEMENT, node), None]
  95. self.lastEvent = self.lastEvent[1]
  96. self.push(node)
  97. def endElementNS(self, name, tagName):
  98. self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
  99. self.lastEvent = self.lastEvent[1]
  100. def startElement(self, name, attrs):
  101. if self.document:
  102. node = self.document.createElement(name)
  103. else:
  104. node = self.buildDocument(None, name)
  105. for aname,value in attrs.items():
  106. attr = self.document.createAttribute(aname)
  107. attr.value = value
  108. node.setAttributeNode(attr)
  109. self.lastEvent[1] = [(START_ELEMENT, node), None]
  110. self.lastEvent = self.lastEvent[1]
  111. self.push(node)
  112. def endElement(self, name):
  113. self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
  114. self.lastEvent = self.lastEvent[1]
  115. def comment(self, s):
  116. if self.document:
  117. node = self.document.createComment(s)
  118. self.lastEvent[1] = [(COMMENT, node), None]
  119. self.lastEvent = self.lastEvent[1]
  120. else:
  121. event = [(COMMENT, s), None]
  122. self.pending_events.append(event)
  123. def processingInstruction(self, target, data):
  124. if self.document:
  125. node = self.document.createProcessingInstruction(target, data)
  126. self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
  127. self.lastEvent = self.lastEvent[1]
  128. else:
  129. event = [(PROCESSING_INSTRUCTION, target, data), None]
  130. self.pending_events.append(event)
  131. def ignorableWhitespace(self, chars):
  132. node = self.document.createTextNode(chars)
  133. self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
  134. self.lastEvent = self.lastEvent[1]
  135. def characters(self, chars):
  136. node = self.document.createTextNode(chars)
  137. self.lastEvent[1] = [(CHARACTERS, node), None]
  138. self.lastEvent = self.lastEvent[1]
  139. def startDocument(self):
  140. if self.documentFactory is None:
  141. import xml.dom.minidom
  142. self.documentFactory = xml.dom.minidom.Document.implementation
  143. def buildDocument(self, uri, tagname):
  144. # Can't do that in startDocument, since we need the tagname
  145. # XXX: obtain DocumentType
  146. node = self.documentFactory.createDocument(uri, tagname, None)
  147. self.document = node
  148. self.lastEvent[1] = [(START_DOCUMENT, node), None]
  149. self.lastEvent = self.lastEvent[1]
  150. self.push(node)
  151. # Put everything we have seen so far into the document
  152. for e in self.pending_events:
  153. if e[0][0] == PROCESSING_INSTRUCTION:
  154. _,target,data = e[0]
  155. n = self.document.createProcessingInstruction(target, data)
  156. e[0] = (PROCESSING_INSTRUCTION, n)
  157. elif e[0][0] == COMMENT:
  158. n = self.document.createComment(e[0][1])
  159. e[0] = (COMMENT, n)
  160. else:
  161. raise AssertionError("Unknown pending event ",e[0][0])
  162. self.lastEvent[1] = e
  163. self.lastEvent = e
  164. self.pending_events = None
  165. return node.firstChild
  166. def endDocument(self):
  167. self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
  168. self.pop()
  169. def clear(self):
  170. "clear(): Explicitly release parsing structures"
  171. self.document = None
  172. class ErrorHandler:
  173. def warning(self, exception):
  174. print(exception)
  175. def error(self, exception):
  176. raise exception
  177. def fatalError(self, exception):
  178. raise exception
  179. class DOMEventStream:
  180. def __init__(self, stream, parser, bufsize):
  181. self.stream = stream
  182. self.parser = parser
  183. self.bufsize = bufsize
  184. if not hasattr(self.parser, 'feed'):
  185. self.getEvent = self._slurp
  186. self.reset()
  187. def reset(self):
  188. self.pulldom = PullDOM()
  189. # This content handler relies on namespace support
  190. self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
  191. self.parser.setContentHandler(self.pulldom)
  192. def __getitem__(self, pos):
  193. rc = self.getEvent()
  194. if rc:
  195. return rc
  196. raise IndexError
  197. def __next__(self):
  198. rc = self.getEvent()
  199. if rc:
  200. return rc
  201. raise StopIteration
  202. def __iter__(self):
  203. return self
  204. def expandNode(self, node):
  205. event = self.getEvent()
  206. parents = [node]
  207. while event:
  208. token, cur_node = event
  209. if cur_node is node:
  210. return
  211. if token != END_ELEMENT:
  212. parents[-1].appendChild(cur_node)
  213. if token == START_ELEMENT:
  214. parents.append(cur_node)
  215. elif token == END_ELEMENT:
  216. del parents[-1]
  217. event = self.getEvent()
  218. def getEvent(self):
  219. # use IncrementalParser interface, so we get the desired
  220. # pull effect
  221. if not self.pulldom.firstEvent[1]:
  222. self.pulldom.lastEvent = self.pulldom.firstEvent
  223. while not self.pulldom.firstEvent[1]:
  224. buf = self.stream.read(self.bufsize)
  225. if not buf:
  226. self.parser.close()
  227. return None
  228. self.parser.feed(buf)
  229. rc = self.pulldom.firstEvent[1][0]
  230. self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
  231. return rc
  232. def _slurp(self):
  233. """ Fallback replacement for getEvent() using the
  234. standard SAX2 interface, which means we slurp the
  235. SAX events into memory (no performance gain, but
  236. we are compatible to all SAX parsers).
  237. """
  238. self.parser.parse(self.stream)
  239. self.getEvent = self._emit
  240. return self._emit()
  241. def _emit(self):
  242. """ Fallback replacement for getEvent() that emits
  243. the events that _slurp() read previously.
  244. """
  245. rc = self.pulldom.firstEvent[1][0]
  246. self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
  247. return rc
  248. def clear(self):
  249. """clear(): Explicitly release parsing objects"""
  250. self.pulldom.clear()
  251. del self.pulldom
  252. self.parser = None
  253. self.stream = None
  254. class SAX2DOM(PullDOM):
  255. def startElementNS(self, name, tagName , attrs):
  256. PullDOM.startElementNS(self, name, tagName, attrs)
  257. curNode = self.elementStack[-1]
  258. parentNode = self.elementStack[-2]
  259. parentNode.appendChild(curNode)
  260. def startElement(self, name, attrs):
  261. PullDOM.startElement(self, name, attrs)
  262. curNode = self.elementStack[-1]
  263. parentNode = self.elementStack[-2]
  264. parentNode.appendChild(curNode)
  265. def processingInstruction(self, target, data):
  266. PullDOM.processingInstruction(self, target, data)
  267. node = self.lastEvent[0][1]
  268. parentNode = self.elementStack[-1]
  269. parentNode.appendChild(node)
  270. def ignorableWhitespace(self, chars):
  271. PullDOM.ignorableWhitespace(self, chars)
  272. node = self.lastEvent[0][1]
  273. parentNode = self.elementStack[-1]
  274. parentNode.appendChild(node)
  275. def characters(self, chars):
  276. PullDOM.characters(self, chars)
  277. node = self.lastEvent[0][1]
  278. parentNode = self.elementStack[-1]
  279. parentNode.appendChild(node)
# Chunk size that parse() uses when the caller passes no bufsize
# (2 ** 14 - 20 == 16364).
default_bufsize = (2 ** 14) - 20
  281. def parse(stream_or_string, parser=None, bufsize=None):
  282. if bufsize is None:
  283. bufsize = default_bufsize
  284. if isinstance(stream_or_string, str):
  285. stream = open(stream_or_string, 'rb')
  286. else:
  287. stream = stream_or_string
  288. if not parser:
  289. parser = xml.sax.make_parser()
  290. return DOMEventStream(stream, parser, bufsize)
  291. def parseString(string, parser=None):
  292. from io import StringIO
  293. bufsize = len(string)
  294. buf = StringIO(string)
  295. if not parser:
  296. parser = xml.sax.make_parser()
  297. return DOMEventStream(buf, parser, bufsize)