# pulldom.py (12 KB)
  1. import xml.sax
  2. import xml.sax.handler
  3. import types
  4. try:
  5. _StringTypes = [types.StringType, types.UnicodeType]
  6. except AttributeError:
  7. _StringTypes = [types.StringType]
  8. START_ELEMENT = "START_ELEMENT"
  9. END_ELEMENT = "END_ELEMENT"
  10. COMMENT = "COMMENT"
  11. START_DOCUMENT = "START_DOCUMENT"
  12. END_DOCUMENT = "END_DOCUMENT"
  13. PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
  14. IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
  15. CHARACTERS = "CHARACTERS"
  16. class PullDOM(xml.sax.ContentHandler):
  17. _locator = None
  18. document = None
  19. def __init__(self, documentFactory=None):
  20. from xml.dom import XML_NAMESPACE
  21. self.documentFactory = documentFactory
  22. self.firstEvent = [None, None]
  23. self.lastEvent = self.firstEvent
  24. self.elementStack = []
  25. self.push = self.elementStack.append
  26. try:
  27. self.pop = self.elementStack.pop
  28. except AttributeError:
  29. # use class' pop instead
  30. pass
  31. self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
  32. self._current_context = self._ns_contexts[-1]
  33. self.pending_events = []
  34. def pop(self):
  35. result = self.elementStack[-1]
  36. del self.elementStack[-1]
  37. return result
  38. def setDocumentLocator(self, locator):
  39. self._locator = locator
  40. def startPrefixMapping(self, prefix, uri):
  41. if not hasattr(self, '_xmlns_attrs'):
  42. self._xmlns_attrs = []
  43. self._xmlns_attrs.append((prefix or 'xmlns', uri))
  44. self._ns_contexts.append(self._current_context.copy())
  45. self._current_context[uri] = prefix or None
  46. def endPrefixMapping(self, prefix):
  47. self._current_context = self._ns_contexts.pop()
  48. def startElementNS(self, name, tagName , attrs):
  49. # Retrieve xml namespace declaration attributes.
  50. xmlns_uri = 'http://www.w3.org/2000/xmlns/'
  51. xmlns_attrs = getattr(self, '_xmlns_attrs', None)
  52. if xmlns_attrs is not None:
  53. for aname, value in xmlns_attrs:
  54. attrs._attrs[(xmlns_uri, aname)] = value
  55. self._xmlns_attrs = []
  56. uri, localname = name
  57. if uri:
  58. # When using namespaces, the reader may or may not
  59. # provide us with the original name. If not, create
  60. # *a* valid tagName from the current context.
  61. if tagName is None:
  62. prefix = self._current_context[uri]
  63. if prefix:
  64. tagName = prefix + ":" + localname
  65. else:
  66. tagName = localname
  67. if self.document:
  68. node = self.document.createElementNS(uri, tagName)
  69. else:
  70. node = self.buildDocument(uri, tagName)
  71. else:
  72. # When the tagname is not prefixed, it just appears as
  73. # localname
  74. if self.document:
  75. node = self.document.createElement(localname)
  76. else:
  77. node = self.buildDocument(None, localname)
  78. for aname,value in attrs.items():
  79. a_uri, a_localname = aname
  80. if a_uri == xmlns_uri:
  81. if a_localname == 'xmlns':
  82. qname = a_localname
  83. else:
  84. qname = 'xmlns:' + a_localname
  85. attr = self.document.createAttributeNS(a_uri, qname)
  86. node.setAttributeNodeNS(attr)
  87. elif a_uri:
  88. prefix = self._current_context[a_uri]
  89. if prefix:
  90. qname = prefix + ":" + a_localname
  91. else:
  92. qname = a_localname
  93. attr = self.document.createAttributeNS(a_uri, qname)
  94. node.setAttributeNodeNS(attr)
  95. else:
  96. attr = self.document.createAttribute(a_localname)
  97. node.setAttributeNode(attr)
  98. attr.value = value
  99. self.lastEvent[1] = [(START_ELEMENT, node), None]
  100. self.lastEvent = self.lastEvent[1]
  101. self.push(node)
  102. def endElementNS(self, name, tagName):
  103. self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
  104. self.lastEvent = self.lastEvent[1]
  105. def startElement(self, name, attrs):
  106. if self.document:
  107. node = self.document.createElement(name)
  108. else:
  109. node = self.buildDocument(None, name)
  110. for aname,value in attrs.items():
  111. attr = self.document.createAttribute(aname)
  112. attr.value = value
  113. node.setAttributeNode(attr)
  114. self.lastEvent[1] = [(START_ELEMENT, node), None]
  115. self.lastEvent = self.lastEvent[1]
  116. self.push(node)
  117. def endElement(self, name):
  118. self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
  119. self.lastEvent = self.lastEvent[1]
  120. def comment(self, s):
  121. if self.document:
  122. node = self.document.createComment(s)
  123. self.lastEvent[1] = [(COMMENT, node), None]
  124. self.lastEvent = self.lastEvent[1]
  125. else:
  126. event = [(COMMENT, s), None]
  127. self.pending_events.append(event)
  128. def processingInstruction(self, target, data):
  129. if self.document:
  130. node = self.document.createProcessingInstruction(target, data)
  131. self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
  132. self.lastEvent = self.lastEvent[1]
  133. else:
  134. event = [(PROCESSING_INSTRUCTION, target, data), None]
  135. self.pending_events.append(event)
  136. def ignorableWhitespace(self, chars):
  137. node = self.document.createTextNode(chars)
  138. self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
  139. self.lastEvent = self.lastEvent[1]
  140. def characters(self, chars):
  141. node = self.document.createTextNode(chars)
  142. self.lastEvent[1] = [(CHARACTERS, node), None]
  143. self.lastEvent = self.lastEvent[1]
  144. def startDocument(self):
  145. if self.documentFactory is None:
  146. import xml.dom.minidom
  147. self.documentFactory = xml.dom.minidom.Document.implementation
  148. def buildDocument(self, uri, tagname):
  149. # Can't do that in startDocument, since we need the tagname
  150. # XXX: obtain DocumentType
  151. node = self.documentFactory.createDocument(uri, tagname, None)
  152. self.document = node
  153. self.lastEvent[1] = [(START_DOCUMENT, node), None]
  154. self.lastEvent = self.lastEvent[1]
  155. self.push(node)
  156. # Put everything we have seen so far into the document
  157. for e in self.pending_events:
  158. if e[0][0] == PROCESSING_INSTRUCTION:
  159. _,target,data = e[0]
  160. n = self.document.createProcessingInstruction(target, data)
  161. e[0] = (PROCESSING_INSTRUCTION, n)
  162. elif e[0][0] == COMMENT:
  163. n = self.document.createComment(e[0][1])
  164. e[0] = (COMMENT, n)
  165. else:
  166. raise AssertionError("Unknown pending event ",e[0][0])
  167. self.lastEvent[1] = e
  168. self.lastEvent = e
  169. self.pending_events = None
  170. return node.firstChild
  171. def endDocument(self):
  172. self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
  173. self.pop()
  174. def clear(self):
  175. "clear(): Explicitly release parsing structures"
  176. self.document = None
  177. class ErrorHandler:
  178. def warning(self, exception):
  179. print exception
  180. def error(self, exception):
  181. raise exception
  182. def fatalError(self, exception):
  183. raise exception
  184. class DOMEventStream:
  185. def __init__(self, stream, parser, bufsize):
  186. self.stream = stream
  187. self.parser = parser
  188. self.bufsize = bufsize
  189. if not hasattr(self.parser, 'feed'):
  190. self.getEvent = self._slurp
  191. self.reset()
  192. def reset(self):
  193. self.pulldom = PullDOM()
  194. # This content handler relies on namespace support
  195. self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
  196. self.parser.setContentHandler(self.pulldom)
  197. def __getitem__(self, pos):
  198. rc = self.getEvent()
  199. if rc:
  200. return rc
  201. raise IndexError
  202. def next(self):
  203. rc = self.getEvent()
  204. if rc:
  205. return rc
  206. raise StopIteration
  207. def __iter__(self):
  208. return self
  209. def expandNode(self, node):
  210. event = self.getEvent()
  211. parents = [node]
  212. while event:
  213. token, cur_node = event
  214. if cur_node is node:
  215. return
  216. if token != END_ELEMENT:
  217. parents[-1].appendChild(cur_node)
  218. if token == START_ELEMENT:
  219. parents.append(cur_node)
  220. elif token == END_ELEMENT:
  221. del parents[-1]
  222. event = self.getEvent()
  223. def getEvent(self):
  224. # use IncrementalParser interface, so we get the desired
  225. # pull effect
  226. if not self.pulldom.firstEvent[1]:
  227. self.pulldom.lastEvent = self.pulldom.firstEvent
  228. while not self.pulldom.firstEvent[1]:
  229. buf = self.stream.read(self.bufsize)
  230. if not buf:
  231. self.parser.close()
  232. return None
  233. self.parser.feed(buf)
  234. rc = self.pulldom.firstEvent[1][0]
  235. self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
  236. return rc
  237. def _slurp(self):
  238. """ Fallback replacement for getEvent() using the
  239. standard SAX2 interface, which means we slurp the
  240. SAX events into memory (no performance gain, but
  241. we are compatible to all SAX parsers).
  242. """
  243. self.parser.parse(self.stream)
  244. self.getEvent = self._emit
  245. return self._emit()
  246. def _emit(self):
  247. """ Fallback replacement for getEvent() that emits
  248. the events that _slurp() read previously.
  249. """
  250. rc = self.pulldom.firstEvent[1][0]
  251. self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
  252. return rc
  253. def clear(self):
  254. """clear(): Explicitly release parsing objects"""
  255. self.pulldom.clear()
  256. del self.pulldom
  257. self.parser = None
  258. self.stream = None
  259. class SAX2DOM(PullDOM):
  260. def startElementNS(self, name, tagName , attrs):
  261. PullDOM.startElementNS(self, name, tagName, attrs)
  262. curNode = self.elementStack[-1]
  263. parentNode = self.elementStack[-2]
  264. parentNode.appendChild(curNode)
  265. def startElement(self, name, attrs):
  266. PullDOM.startElement(self, name, attrs)
  267. curNode = self.elementStack[-1]
  268. parentNode = self.elementStack[-2]
  269. parentNode.appendChild(curNode)
  270. def processingInstruction(self, target, data):
  271. PullDOM.processingInstruction(self, target, data)
  272. node = self.lastEvent[0][1]
  273. parentNode = self.elementStack[-1]
  274. parentNode.appendChild(node)
  275. def ignorableWhitespace(self, chars):
  276. PullDOM.ignorableWhitespace(self, chars)
  277. node = self.lastEvent[0][1]
  278. parentNode = self.elementStack[-1]
  279. parentNode.appendChild(node)
  280. def characters(self, chars):
  281. PullDOM.characters(self, chars)
  282. node = self.lastEvent[0][1]
  283. parentNode = self.elementStack[-1]
  284. parentNode.appendChild(node)
  285. default_bufsize = (2 ** 14) - 20
  286. def parse(stream_or_string, parser=None, bufsize=None):
  287. if bufsize is None:
  288. bufsize = default_bufsize
  289. if type(stream_or_string) in _StringTypes:
  290. stream = open(stream_or_string)
  291. else:
  292. stream = stream_or_string
  293. if not parser:
  294. parser = xml.sax.make_parser()
  295. return DOMEventStream(stream, parser, bufsize)
  296. def parseString(string, parser=None):
  297. try:
  298. from cStringIO import StringIO
  299. except ImportError:
  300. from StringIO import StringIO
  301. bufsize = len(string)
  302. buf = StringIO(string)
  303. if not parser:
  304. parser = xml.sax.make_parser()
  305. return DOMEventStream(buf, parser, bufsize)