expatbuilder.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983
  1. """Facility to use the Expat parser to load a minidom instance
  2. from a string or file.
  3. This avoids all the overhead of SAX and pulldom to gain performance.
  4. """
  5. # Warning!
  6. #
  7. # This module is tightly bound to the implementation details of the
  8. # minidom DOM and can't be used with other DOM implementations. This
  9. # is due, in part, to a lack of appropriate methods in the DOM (there is
  10. # no way to create Entity and Notation nodes via the DOM Level 2
  11. # interface), and for performance. The latter is the cause of some fairly
  12. # cryptic code.
  13. #
  14. # Performance hacks:
  15. #
  16. # - .character_data_handler() has an extra case in which continuing
  17. # data is appended to an existing Text node; this can be a
  18. # speedup since pyexpat can break up character data into multiple
  19. # callbacks even though we set the buffer_text attribute on the
  20. # parser. This also gives us the advantage that we don't need a
  21. # separate normalization pass.
  22. #
  23. # - Determining that a node exists is done using an identity comparison
  24. # with None rather than a truth test; this avoids searching for and
  25. # calling any methods on the node object if it exists. (A rather
  26. # nice speedup is achieved this way as well!)
  27. from xml.dom import xmlbuilder, minidom, Node
  28. from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
  29. from xml.parsers import expat
  30. from xml.dom.minidom import _append_child, _set_attribute_node
  31. from xml.dom.NodeFilter import NodeFilter
  32. from xml.dom.minicompat import *
  33. TEXT_NODE = Node.TEXT_NODE
  34. CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
  35. DOCUMENT_NODE = Node.DOCUMENT_NODE
  36. FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
  37. FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
  38. FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
  39. FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT
  40. theDOMImplementation = minidom.getDOMImplementation()
  41. # Expat typename -> TypeInfo
  42. _typeinfo_map = {
  43. "CDATA": minidom.TypeInfo(None, "cdata"),
  44. "ENUM": minidom.TypeInfo(None, "enumeration"),
  45. "ENTITY": minidom.TypeInfo(None, "entity"),
  46. "ENTITIES": minidom.TypeInfo(None, "entities"),
  47. "ID": minidom.TypeInfo(None, "id"),
  48. "IDREF": minidom.TypeInfo(None, "idref"),
  49. "IDREFS": minidom.TypeInfo(None, "idrefs"),
  50. "NMTOKEN": minidom.TypeInfo(None, "nmtoken"),
  51. "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
  52. }
  53. class ElementInfo(object):
  54. __slots__ = '_attr_info', '_model', 'tagName'
  55. def __init__(self, tagName, model=None):
  56. self.tagName = tagName
  57. self._attr_info = []
  58. self._model = model
  59. def __getstate__(self):
  60. return self._attr_info, self._model, self.tagName
  61. def __setstate__(self, state):
  62. self._attr_info, self._model, self.tagName = state
  63. def getAttributeType(self, aname):
  64. for info in self._attr_info:
  65. if info[1] == aname:
  66. t = info[-2]
  67. if t[0] == "(":
  68. return _typeinfo_map["ENUM"]
  69. else:
  70. return _typeinfo_map[info[-2]]
  71. return minidom._no_type
  72. def getAttributeTypeNS(self, namespaceURI, localName):
  73. return minidom._no_type
  74. def isElementContent(self):
  75. if self._model:
  76. type = self._model[0]
  77. return type not in (expat.model.XML_CTYPE_ANY,
  78. expat.model.XML_CTYPE_MIXED)
  79. else:
  80. return False
  81. def isEmpty(self):
  82. if self._model:
  83. return self._model[0] == expat.model.XML_CTYPE_EMPTY
  84. else:
  85. return False
  86. def isId(self, aname):
  87. for info in self._attr_info:
  88. if info[1] == aname:
  89. return info[-2] == "ID"
  90. return False
  91. def isIdNS(self, euri, ename, auri, aname):
  92. # not sure this is meaningful
  93. return self.isId((auri, aname))
  94. def _intern(builder, s):
  95. return builder._intern_setdefault(s, s)
  96. def _parse_ns_name(builder, name):
  97. assert ' ' in name
  98. parts = name.split(' ')
  99. intern = builder._intern_setdefault
  100. if len(parts) == 3:
  101. uri, localname, prefix = parts
  102. prefix = intern(prefix, prefix)
  103. qname = "%s:%s" % (prefix, localname)
  104. qname = intern(qname, qname)
  105. localname = intern(localname, localname)
  106. else:
  107. uri, localname = parts
  108. prefix = EMPTY_PREFIX
  109. qname = localname = intern(localname, localname)
  110. return intern(uri, uri), localname, prefix, qname
  111. class ExpatBuilder:
  112. """Document builder that uses Expat to build a ParsedXML.DOM document
  113. instance."""
  114. def __init__(self, options=None):
  115. if options is None:
  116. options = xmlbuilder.Options()
  117. self._options = options
  118. if self._options.filter is not None:
  119. self._filter = FilterVisibilityController(self._options.filter)
  120. else:
  121. self._filter = None
  122. # This *really* doesn't do anything in this case, so
  123. # override it with something fast & minimal.
  124. self._finish_start_element = id
  125. self._parser = None
  126. self.reset()
  127. def createParser(self):
  128. """Create a new parser object."""
  129. return expat.ParserCreate()
  130. def getParser(self):
  131. """Return the parser object, creating a new one if needed."""
  132. if not self._parser:
  133. self._parser = self.createParser()
  134. self._intern_setdefault = self._parser.intern.setdefault
  135. self._parser.buffer_text = True
  136. self._parser.ordered_attributes = True
  137. self._parser.specified_attributes = True
  138. self.install(self._parser)
  139. return self._parser
  140. def reset(self):
  141. """Free all data structures used during DOM construction."""
  142. self.document = theDOMImplementation.createDocument(
  143. EMPTY_NAMESPACE, None, None)
  144. self.curNode = self.document
  145. self._elem_info = self.document._elem_info
  146. self._cdata = False
  147. def install(self, parser):
  148. """Install the callbacks needed to build the DOM into the parser."""
  149. # This creates circular references!
  150. parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
  151. parser.StartElementHandler = self.first_element_handler
  152. parser.EndElementHandler = self.end_element_handler
  153. parser.ProcessingInstructionHandler = self.pi_handler
  154. if self._options.entities:
  155. parser.EntityDeclHandler = self.entity_decl_handler
  156. parser.NotationDeclHandler = self.notation_decl_handler
  157. if self._options.comments:
  158. parser.CommentHandler = self.comment_handler
  159. if self._options.cdata_sections:
  160. parser.StartCdataSectionHandler = self.start_cdata_section_handler
  161. parser.EndCdataSectionHandler = self.end_cdata_section_handler
  162. parser.CharacterDataHandler = self.character_data_handler_cdata
  163. else:
  164. parser.CharacterDataHandler = self.character_data_handler
  165. parser.ExternalEntityRefHandler = self.external_entity_ref_handler
  166. parser.XmlDeclHandler = self.xml_decl_handler
  167. parser.ElementDeclHandler = self.element_decl_handler
  168. parser.AttlistDeclHandler = self.attlist_decl_handler
  169. def parseFile(self, file):
  170. """Parse a document from a file object, returning the document
  171. node."""
  172. parser = self.getParser()
  173. first_buffer = True
  174. try:
  175. while 1:
  176. buffer = file.read(16*1024)
  177. if not buffer:
  178. break
  179. parser.Parse(buffer, 0)
  180. if first_buffer and self.document.documentElement:
  181. self._setup_subset(buffer)
  182. first_buffer = False
  183. parser.Parse("", True)
  184. except ParseEscape:
  185. pass
  186. doc = self.document
  187. self.reset()
  188. self._parser = None
  189. return doc
  190. def parseString(self, string):
  191. """Parse a document from a string, returning the document node."""
  192. parser = self.getParser()
  193. try:
  194. parser.Parse(string, True)
  195. self._setup_subset(string)
  196. except ParseEscape:
  197. pass
  198. doc = self.document
  199. self.reset()
  200. self._parser = None
  201. return doc
  202. def _setup_subset(self, buffer):
  203. """Load the internal subset if there might be one."""
  204. if self.document.doctype:
  205. extractor = InternalSubsetExtractor()
  206. extractor.parseString(buffer)
  207. subset = extractor.getSubset()
  208. self.document.doctype.internalSubset = subset
  209. def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
  210. has_internal_subset):
  211. doctype = self.document.implementation.createDocumentType(
  212. doctypeName, publicId, systemId)
  213. doctype.ownerDocument = self.document
  214. _append_child(self.document, doctype)
  215. self.document.doctype = doctype
  216. if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
  217. self.document.doctype = None
  218. del self.document.childNodes[-1]
  219. doctype = None
  220. self._parser.EntityDeclHandler = None
  221. self._parser.NotationDeclHandler = None
  222. if has_internal_subset:
  223. if doctype is not None:
  224. doctype.entities._seq = []
  225. doctype.notations._seq = []
  226. self._parser.CommentHandler = None
  227. self._parser.ProcessingInstructionHandler = None
  228. self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
  229. def end_doctype_decl_handler(self):
  230. if self._options.comments:
  231. self._parser.CommentHandler = self.comment_handler
  232. self._parser.ProcessingInstructionHandler = self.pi_handler
  233. if not (self._elem_info or self._filter):
  234. self._finish_end_element = id
  235. def pi_handler(self, target, data):
  236. node = self.document.createProcessingInstruction(target, data)
  237. _append_child(self.curNode, node)
  238. if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
  239. self.curNode.removeChild(node)
  240. def character_data_handler_cdata(self, data):
  241. childNodes = self.curNode.childNodes
  242. if self._cdata:
  243. if ( self._cdata_continue
  244. and childNodes[-1].nodeType == CDATA_SECTION_NODE):
  245. childNodes[-1].appendData(data)
  246. return
  247. node = self.document.createCDATASection(data)
  248. self._cdata_continue = True
  249. elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
  250. node = childNodes[-1]
  251. value = node.data + data
  252. d = node.__dict__
  253. d['data'] = d['nodeValue'] = value
  254. return
  255. else:
  256. node = minidom.Text()
  257. d = node.__dict__
  258. d['data'] = d['nodeValue'] = data
  259. d['ownerDocument'] = self.document
  260. _append_child(self.curNode, node)
  261. def character_data_handler(self, data):
  262. childNodes = self.curNode.childNodes
  263. if childNodes and childNodes[-1].nodeType == TEXT_NODE:
  264. node = childNodes[-1]
  265. d = node.__dict__
  266. d['data'] = d['nodeValue'] = node.data + data
  267. return
  268. node = minidom.Text()
  269. d = node.__dict__
  270. d['data'] = d['nodeValue'] = node.data + data
  271. d['ownerDocument'] = self.document
  272. _append_child(self.curNode, node)
  273. def entity_decl_handler(self, entityName, is_parameter_entity, value,
  274. base, systemId, publicId, notationName):
  275. if is_parameter_entity:
  276. # we don't care about parameter entities for the DOM
  277. return
  278. if not self._options.entities:
  279. return
  280. node = self.document._create_entity(entityName, publicId,
  281. systemId, notationName)
  282. if value is not None:
  283. # internal entity
  284. # node *should* be readonly, but we'll cheat
  285. child = self.document.createTextNode(value)
  286. node.childNodes.append(child)
  287. self.document.doctype.entities._seq.append(node)
  288. if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
  289. del self.document.doctype.entities._seq[-1]
  290. def notation_decl_handler(self, notationName, base, systemId, publicId):
  291. node = self.document._create_notation(notationName, publicId, systemId)
  292. self.document.doctype.notations._seq.append(node)
  293. if self._filter and self._filter.acceptNode(node) == FILTER_ACCEPT:
  294. del self.document.doctype.notations._seq[-1]
  295. def comment_handler(self, data):
  296. node = self.document.createComment(data)
  297. _append_child(self.curNode, node)
  298. if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
  299. self.curNode.removeChild(node)
  300. def start_cdata_section_handler(self):
  301. self._cdata = True
  302. self._cdata_continue = False
  303. def end_cdata_section_handler(self):
  304. self._cdata = False
  305. self._cdata_continue = False
  306. def external_entity_ref_handler(self, context, base, systemId, publicId):
  307. return 1
  308. def first_element_handler(self, name, attributes):
  309. if self._filter is None and not self._elem_info:
  310. self._finish_end_element = id
  311. self.getParser().StartElementHandler = self.start_element_handler
  312. self.start_element_handler(name, attributes)
  313. def start_element_handler(self, name, attributes):
  314. node = self.document.createElement(name)
  315. _append_child(self.curNode, node)
  316. self.curNode = node
  317. if attributes:
  318. for i in range(0, len(attributes), 2):
  319. a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
  320. None, EMPTY_PREFIX)
  321. value = attributes[i+1]
  322. d = a.childNodes[0].__dict__
  323. d['data'] = d['nodeValue'] = value
  324. d = a.__dict__
  325. d['value'] = d['nodeValue'] = value
  326. d['ownerDocument'] = self.document
  327. _set_attribute_node(node, a)
  328. if node is not self.document.documentElement:
  329. self._finish_start_element(node)
  330. def _finish_start_element(self, node):
  331. if self._filter:
  332. # To be general, we'd have to call isSameNode(), but this
  333. # is sufficient for minidom:
  334. if node is self.document.documentElement:
  335. return
  336. filt = self._filter.startContainer(node)
  337. if filt == FILTER_REJECT:
  338. # ignore this node & all descendents
  339. Rejecter(self)
  340. elif filt == FILTER_SKIP:
  341. # ignore this node, but make it's children become
  342. # children of the parent node
  343. Skipper(self)
  344. else:
  345. return
  346. self.curNode = node.parentNode
  347. node.parentNode.removeChild(node)
  348. node.unlink()
  349. # If this ever changes, Namespaces.end_element_handler() needs to
  350. # be changed to match.
  351. #
  352. def end_element_handler(self, name):
  353. curNode = self.curNode
  354. self.curNode = curNode.parentNode
  355. self._finish_end_element(curNode)
  356. def _finish_end_element(self, curNode):
  357. info = self._elem_info.get(curNode.tagName)
  358. if info:
  359. self._handle_white_text_nodes(curNode, info)
  360. if self._filter:
  361. if curNode is self.document.documentElement:
  362. return
  363. if self._filter.acceptNode(curNode) == FILTER_REJECT:
  364. self.curNode.removeChild(curNode)
  365. curNode.unlink()
  366. def _handle_white_text_nodes(self, node, info):
  367. if (self._options.whitespace_in_element_content
  368. or not info.isElementContent()):
  369. return
  370. # We have element type information and should remove ignorable
  371. # whitespace; identify for text nodes which contain only
  372. # whitespace.
  373. L = []
  374. for child in node.childNodes:
  375. if child.nodeType == TEXT_NODE and not child.data.strip():
  376. L.append(child)
  377. # Remove ignorable whitespace from the tree.
  378. for child in L:
  379. node.removeChild(child)
  380. def element_decl_handler(self, name, model):
  381. info = self._elem_info.get(name)
  382. if info is None:
  383. self._elem_info[name] = ElementInfo(name, model)
  384. else:
  385. assert info._model is None
  386. info._model = model
  387. def attlist_decl_handler(self, elem, name, type, default, required):
  388. info = self._elem_info.get(elem)
  389. if info is None:
  390. info = ElementInfo(elem)
  391. self._elem_info[elem] = info
  392. info._attr_info.append(
  393. [None, name, None, None, default, 0, type, required])
  394. def xml_decl_handler(self, version, encoding, standalone):
  395. self.document.version = version
  396. self.document.encoding = encoding
  397. # This is still a little ugly, thanks to the pyexpat API. ;-(
  398. if standalone >= 0:
  399. if standalone:
  400. self.document.standalone = True
  401. else:
  402. self.document.standalone = False
  403. # Don't include FILTER_INTERRUPT, since that's checked separately
  404. # where allowed.
  405. _ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
  406. class FilterVisibilityController(object):
  407. """Wrapper around a DOMBuilderFilter which implements the checks
  408. to make the whatToShow filter attribute work."""
  409. __slots__ = 'filter',
  410. def __init__(self, filter):
  411. self.filter = filter
  412. def startContainer(self, node):
  413. mask = self._nodetype_mask[node.nodeType]
  414. if self.filter.whatToShow & mask:
  415. val = self.filter.startContainer(node)
  416. if val == FILTER_INTERRUPT:
  417. raise ParseEscape
  418. if val not in _ALLOWED_FILTER_RETURNS:
  419. raise ValueError, \
  420. "startContainer() returned illegal value: " + repr(val)
  421. return val
  422. else:
  423. return FILTER_ACCEPT
  424. def acceptNode(self, node):
  425. mask = self._nodetype_mask[node.nodeType]
  426. if self.filter.whatToShow & mask:
  427. val = self.filter.acceptNode(node)
  428. if val == FILTER_INTERRUPT:
  429. raise ParseEscape
  430. if val == FILTER_SKIP:
  431. # move all child nodes to the parent, and remove this node
  432. parent = node.parentNode
  433. for child in node.childNodes[:]:
  434. parent.appendChild(child)
  435. # node is handled by the caller
  436. return FILTER_REJECT
  437. if val not in _ALLOWED_FILTER_RETURNS:
  438. raise ValueError, \
  439. "acceptNode() returned illegal value: " + repr(val)
  440. return val
  441. else:
  442. return FILTER_ACCEPT
  443. _nodetype_mask = {
  444. Node.ELEMENT_NODE: NodeFilter.SHOW_ELEMENT,
  445. Node.ATTRIBUTE_NODE: NodeFilter.SHOW_ATTRIBUTE,
  446. Node.TEXT_NODE: NodeFilter.SHOW_TEXT,
  447. Node.CDATA_SECTION_NODE: NodeFilter.SHOW_CDATA_SECTION,
  448. Node.ENTITY_REFERENCE_NODE: NodeFilter.SHOW_ENTITY_REFERENCE,
  449. Node.ENTITY_NODE: NodeFilter.SHOW_ENTITY,
  450. Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
  451. Node.COMMENT_NODE: NodeFilter.SHOW_COMMENT,
  452. Node.DOCUMENT_NODE: NodeFilter.SHOW_DOCUMENT,
  453. Node.DOCUMENT_TYPE_NODE: NodeFilter.SHOW_DOCUMENT_TYPE,
  454. Node.DOCUMENT_FRAGMENT_NODE: NodeFilter.SHOW_DOCUMENT_FRAGMENT,
  455. Node.NOTATION_NODE: NodeFilter.SHOW_NOTATION,
  456. }
  457. class FilterCrutch(object):
  458. __slots__ = '_builder', '_level', '_old_start', '_old_end'
  459. def __init__(self, builder):
  460. self._level = 0
  461. self._builder = builder
  462. parser = builder._parser
  463. self._old_start = parser.StartElementHandler
  464. self._old_end = parser.EndElementHandler
  465. parser.StartElementHandler = self.start_element_handler
  466. parser.EndElementHandler = self.end_element_handler
  467. class Rejecter(FilterCrutch):
  468. __slots__ = ()
  469. def __init__(self, builder):
  470. FilterCrutch.__init__(self, builder)
  471. parser = builder._parser
  472. for name in ("ProcessingInstructionHandler",
  473. "CommentHandler",
  474. "CharacterDataHandler",
  475. "StartCdataSectionHandler",
  476. "EndCdataSectionHandler",
  477. "ExternalEntityRefHandler",
  478. ):
  479. setattr(parser, name, None)
  480. def start_element_handler(self, *args):
  481. self._level = self._level + 1
  482. def end_element_handler(self, *args):
  483. if self._level == 0:
  484. # restore the old handlers
  485. parser = self._builder._parser
  486. self._builder.install(parser)
  487. parser.StartElementHandler = self._old_start
  488. parser.EndElementHandler = self._old_end
  489. else:
  490. self._level = self._level - 1
  491. class Skipper(FilterCrutch):
  492. __slots__ = ()
  493. def start_element_handler(self, *args):
  494. node = self._builder.curNode
  495. self._old_start(*args)
  496. if self._builder.curNode is not node:
  497. self._level = self._level + 1
  498. def end_element_handler(self, *args):
  499. if self._level == 0:
  500. # We're popping back out of the node we're skipping, so we
  501. # shouldn't need to do anything but reset the handlers.
  502. self._builder._parser.StartElementHandler = self._old_start
  503. self._builder._parser.EndElementHandler = self._old_end
  504. self._builder = None
  505. else:
  506. self._level = self._level - 1
  507. self._old_end(*args)
  508. # framework document used by the fragment builder.
  509. # Takes a string for the doctype, subset string, and namespace attrs string.
  510. _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
  511. "http://xml.python.org/entities/fragment-builder/internal"
  512. _FRAGMENT_BUILDER_TEMPLATE = (
  513. '''\
  514. <!DOCTYPE wrapper
  515. %%s [
  516. <!ENTITY fragment-builder-internal
  517. SYSTEM "%s">
  518. %%s
  519. ]>
  520. <wrapper %%s
  521. >&fragment-builder-internal;</wrapper>'''
  522. % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
  523. class FragmentBuilder(ExpatBuilder):
  524. """Builder which constructs document fragments given XML source
  525. text and a context node.
  526. The context node is expected to provide information about the
  527. namespace declarations which are in scope at the start of the
  528. fragment.
  529. """
  530. def __init__(self, context, options=None):
  531. if context.nodeType == DOCUMENT_NODE:
  532. self.originalDocument = context
  533. self.context = context
  534. else:
  535. self.originalDocument = context.ownerDocument
  536. self.context = context
  537. ExpatBuilder.__init__(self, options)
  538. def reset(self):
  539. ExpatBuilder.reset(self)
  540. self.fragment = None
  541. def parseFile(self, file):
  542. """Parse a document fragment from a file object, returning the
  543. fragment node."""
  544. return self.parseString(file.read())
  545. def parseString(self, string):
  546. """Parse a document fragment from a string, returning the
  547. fragment node."""
  548. self._source = string
  549. parser = self.getParser()
  550. doctype = self.originalDocument.doctype
  551. ident = ""
  552. if doctype:
  553. subset = doctype.internalSubset or self._getDeclarations()
  554. if doctype.publicId:
  555. ident = ('PUBLIC "%s" "%s"'
  556. % (doctype.publicId, doctype.systemId))
  557. elif doctype.systemId:
  558. ident = 'SYSTEM "%s"' % doctype.systemId
  559. else:
  560. subset = ""
  561. nsattrs = self._getNSattrs() # get ns decls from node's ancestors
  562. document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
  563. try:
  564. parser.Parse(document, 1)
  565. except:
  566. self.reset()
  567. raise
  568. fragment = self.fragment
  569. self.reset()
  570. ## self._parser = None
  571. return fragment
  572. def _getDeclarations(self):
  573. """Re-create the internal subset from the DocumentType node.
  574. This is only needed if we don't already have the
  575. internalSubset as a string.
  576. """
  577. doctype = self.context.ownerDocument.doctype
  578. s = ""
  579. if doctype:
  580. for i in range(doctype.notations.length):
  581. notation = doctype.notations.item(i)
  582. if s:
  583. s = s + "\n "
  584. s = "%s<!NOTATION %s" % (s, notation.nodeName)
  585. if notation.publicId:
  586. s = '%s PUBLIC "%s"\n "%s">' \
  587. % (s, notation.publicId, notation.systemId)
  588. else:
  589. s = '%s SYSTEM "%s">' % (s, notation.systemId)
  590. for i in range(doctype.entities.length):
  591. entity = doctype.entities.item(i)
  592. if s:
  593. s = s + "\n "
  594. s = "%s<!ENTITY %s" % (s, entity.nodeName)
  595. if entity.publicId:
  596. s = '%s PUBLIC "%s"\n "%s"' \
  597. % (s, entity.publicId, entity.systemId)
  598. elif entity.systemId:
  599. s = '%s SYSTEM "%s"' % (s, entity.systemId)
  600. else:
  601. s = '%s "%s"' % (s, entity.firstChild.data)
  602. if entity.notationName:
  603. s = "%s NOTATION %s" % (s, entity.notationName)
  604. s = s + ">"
  605. return s
  606. def _getNSattrs(self):
  607. return ""
  608. def external_entity_ref_handler(self, context, base, systemId, publicId):
  609. if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
  610. # this entref is the one that we made to put the subtree
  611. # in; all of our given input is parsed in here.
  612. old_document = self.document
  613. old_cur_node = self.curNode
  614. parser = self._parser.ExternalEntityParserCreate(context)
  615. # put the real document back, parse into the fragment to return
  616. self.document = self.originalDocument
  617. self.fragment = self.document.createDocumentFragment()
  618. self.curNode = self.fragment
  619. try:
  620. parser.Parse(self._source, 1)
  621. finally:
  622. self.curNode = old_cur_node
  623. self.document = old_document
  624. self._source = None
  625. return -1
  626. else:
  627. return ExpatBuilder.external_entity_ref_handler(
  628. self, context, base, systemId, publicId)
  629. class Namespaces:
  630. """Mix-in class for builders; adds support for namespaces."""
  631. def _initNamespaces(self):
  632. # list of (prefix, uri) ns declarations. Namespace attrs are
  633. # constructed from this and added to the element's attrs.
  634. self._ns_ordered_prefixes = []
  635. def createParser(self):
  636. """Create a new namespace-handling parser."""
  637. parser = expat.ParserCreate(namespace_separator=" ")
  638. parser.namespace_prefixes = True
  639. return parser
  640. def install(self, parser):
  641. """Insert the namespace-handlers onto the parser."""
  642. ExpatBuilder.install(self, parser)
  643. if self._options.namespace_declarations:
  644. parser.StartNamespaceDeclHandler = (
  645. self.start_namespace_decl_handler)
  646. def start_namespace_decl_handler(self, prefix, uri):
  647. """Push this namespace declaration on our storage."""
  648. self._ns_ordered_prefixes.append((prefix, uri))
  649. def start_element_handler(self, name, attributes):
  650. if ' ' in name:
  651. uri, localname, prefix, qname = _parse_ns_name(self, name)
  652. else:
  653. uri = EMPTY_NAMESPACE
  654. qname = name
  655. localname = None
  656. prefix = EMPTY_PREFIX
  657. node = minidom.Element(qname, uri, prefix, localname)
  658. node.ownerDocument = self.document
  659. _append_child(self.curNode, node)
  660. self.curNode = node
  661. if self._ns_ordered_prefixes:
  662. for prefix, uri in self._ns_ordered_prefixes:
  663. if prefix:
  664. a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
  665. XMLNS_NAMESPACE, prefix, "xmlns")
  666. else:
  667. a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
  668. "xmlns", EMPTY_PREFIX)
  669. d = a.childNodes[0].__dict__
  670. d['data'] = d['nodeValue'] = uri
  671. d = a.__dict__
  672. d['value'] = d['nodeValue'] = uri
  673. d['ownerDocument'] = self.document
  674. _set_attribute_node(node, a)
  675. del self._ns_ordered_prefixes[:]
  676. if attributes:
  677. _attrs = node._attrs
  678. _attrsNS = node._attrsNS
  679. for i in range(0, len(attributes), 2):
  680. aname = attributes[i]
  681. value = attributes[i+1]
  682. if ' ' in aname:
  683. uri, localname, prefix, qname = _parse_ns_name(self, aname)
  684. a = minidom.Attr(qname, uri, localname, prefix)
  685. _attrs[qname] = a
  686. _attrsNS[(uri, localname)] = a
  687. else:
  688. a = minidom.Attr(aname, EMPTY_NAMESPACE,
  689. aname, EMPTY_PREFIX)
  690. _attrs[aname] = a
  691. _attrsNS[(EMPTY_NAMESPACE, aname)] = a
  692. d = a.childNodes[0].__dict__
  693. d['data'] = d['nodeValue'] = value
  694. d = a.__dict__
  695. d['ownerDocument'] = self.document
  696. d['value'] = d['nodeValue'] = value
  697. d['ownerElement'] = node
  698. if __debug__:
  699. # This only adds some asserts to the original
  700. # end_element_handler(), so we only define this when -O is not
  701. # used. If changing one, be sure to check the other to see if
  702. # it needs to be changed as well.
  703. #
  704. def end_element_handler(self, name):
  705. curNode = self.curNode
  706. if ' ' in name:
  707. uri, localname, prefix, qname = _parse_ns_name(self, name)
  708. assert (curNode.namespaceURI == uri
  709. and curNode.localName == localname
  710. and curNode.prefix == prefix), \
  711. "element stack messed up! (namespace)"
  712. else:
  713. assert curNode.nodeName == name, \
  714. "element stack messed up - bad nodeName"
  715. assert curNode.namespaceURI == EMPTY_NAMESPACE, \
  716. "element stack messed up - bad namespaceURI"
  717. self.curNode = curNode.parentNode
  718. self._finish_end_element(curNode)
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Document builder that supports namespaces."""

    def reset(self):
        """Reset the ExpatBuilder state, then the list of pending
        namespace declarations kept by the Namespaces mix-in."""
        ExpatBuilder.reset(self)
        self._initNamespaces()
  724. class FragmentBuilderNS(Namespaces, FragmentBuilder):
  725. """Fragment builder that supports namespaces."""
  726. def reset(self):
  727. FragmentBuilder.reset(self)
  728. self._initNamespaces()
  729. def _getNSattrs(self):
  730. """Return string of namespace attributes from this element and
  731. ancestors."""
  732. # XXX This needs to be re-written to walk the ancestors of the
  733. # context to build up the namespace information from
  734. # declarations, elements, and attributes found in context.
  735. # Otherwise we have to store a bunch more data on the DOM
  736. # (though that *might* be more reliable -- not clear).
  737. attrs = ""
  738. context = self.context
  739. L = []
  740. while context:
  741. if hasattr(context, '_ns_prefix_uri'):
  742. for prefix, uri in context._ns_prefix_uri.items():
  743. # add every new NS decl from context to L and attrs string
  744. if prefix in L:
  745. continue
  746. L.append(prefix)
  747. if prefix:
  748. declname = "xmlns:" + prefix
  749. else:
  750. declname = "xmlns"
  751. if attrs:
  752. attrs = "%s\n %s='%s'" % (attrs, declname, uri)
  753. else:
  754. attrs = " %s='%s'" % (declname, uri)
  755. context = context.parentNode
  756. return attrs
  757. class ParseEscape(Exception):
  758. """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
  759. pass
  760. class InternalSubsetExtractor(ExpatBuilder):
  761. """XML processor which can rip out the internal document type subset."""
  762. subset = None
  763. def getSubset(self):
  764. """Return the internal subset as a string."""
  765. return self.subset
  766. def parseFile(self, file):
  767. try:
  768. ExpatBuilder.parseFile(self, file)
  769. except ParseEscape:
  770. pass
  771. def parseString(self, string):
  772. try:
  773. ExpatBuilder.parseString(self, string)
  774. except ParseEscape:
  775. pass
  776. def install(self, parser):
  777. parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
  778. parser.StartElementHandler = self.start_element_handler
  779. def start_doctype_decl_handler(self, name, publicId, systemId,
  780. has_internal_subset):
  781. if has_internal_subset:
  782. parser = self.getParser()
  783. self.subset = []
  784. parser.DefaultHandler = self.subset.append
  785. parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
  786. else:
  787. raise ParseEscape()
  788. def end_doctype_decl_handler(self):
  789. s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n')
  790. self.subset = s
  791. raise ParseEscape()
  792. def start_element_handler(self, name, attrs):
  793. raise ParseEscape()
  794. def parse(file, namespaces=True):
  795. """Parse a document, returning the resulting Document node.
  796. 'file' may be either a file name or an open file object.
  797. """
  798. if namespaces:
  799. builder = ExpatBuilderNS()
  800. else:
  801. builder = ExpatBuilder()
  802. if isinstance(file, StringTypes):
  803. fp = open(file, 'rb')
  804. try:
  805. result = builder.parseFile(fp)
  806. finally:
  807. fp.close()
  808. else:
  809. result = builder.parseFile(file)
  810. return result
  811. def parseString(string, namespaces=True):
  812. """Parse a document from a string, returning the resulting
  813. Document node.
  814. """
  815. if namespaces:
  816. builder = ExpatBuilderNS()
  817. else:
  818. builder = ExpatBuilder()
  819. return builder.parseString(string)
  820. def parseFragment(file, context, namespaces=True):
  821. """Parse a fragment of a document, given the context from which it
  822. was originally extracted. context should be the parent of the
  823. node(s) which are in the fragment.
  824. 'file' may be either a file name or an open file object.
  825. """
  826. if namespaces:
  827. builder = FragmentBuilderNS(context)
  828. else:
  829. builder = FragmentBuilder(context)
  830. if isinstance(file, StringTypes):
  831. fp = open(file, 'rb')
  832. try:
  833. result = builder.parseFile(fp)
  834. finally:
  835. fp.close()
  836. else:
  837. result = builder.parseFile(file)
  838. return result
  839. def parseFragmentString(string, context, namespaces=True):
  840. """Parse a fragment of a document from a string, given the context
  841. from which it was originally extracted. context should be the
  842. parent of the node(s) which are in the fragment.
  843. """
  844. if namespaces:
  845. builder = FragmentBuilderNS(context)
  846. else:
  847. builder = FragmentBuilder(context)
  848. return builder.parseString(string)
  849. def makeBuilder(options):
  850. """Create a builder based on an Options object."""
  851. if options.namespaces:
  852. return ExpatBuilderNS(options)
  853. else:
  854. return ExpatBuilder(options)