expatreader.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. """
  2. SAX driver for the pyexpat C module. This driver works with
  3. pyexpat.__version__ == '2.22'.
  4. """
  5. version = "0.20"
  6. from xml.sax._exceptions import *
  7. from xml.sax.handler import feature_validation, feature_namespaces
  8. from xml.sax.handler import feature_namespace_prefixes
  9. from xml.sax.handler import feature_external_ges, feature_external_pes
  10. from xml.sax.handler import feature_string_interning
  11. from xml.sax.handler import property_xml_string, property_interning_dict
  12. # xml.parsers.expat does not raise ImportError in Jython
  13. import sys
  14. if sys.platform[:4] == "java":
  15. raise SAXReaderNotAvailable("expat not available in Java", None)
  16. del sys
  17. try:
  18. from xml.parsers import expat
  19. except ImportError:
  20. raise SAXReaderNotAvailable("expat not supported", None)
  21. else:
  22. if not hasattr(expat, "ParserCreate"):
  23. raise SAXReaderNotAvailable("expat not supported", None)
  24. from xml.sax import xmlreader, saxutils, handler
  25. AttributesImpl = xmlreader.AttributesImpl
  26. AttributesNSImpl = xmlreader.AttributesNSImpl
  27. # If we're using a sufficiently recent version of Python, we can use
  28. # weak references to avoid cycles between the parser and content
  29. # handler, otherwise we'll just have to pretend.
  30. try:
  31. import _weakref
  32. except ImportError:
  33. def _mkproxy(o):
  34. return o
  35. else:
  36. import weakref
  37. _mkproxy = weakref.proxy
  38. del weakref, _weakref
  39. class _ClosedParser:
  40. pass
  41. # --- ExpatLocator
  42. class ExpatLocator(xmlreader.Locator):
  43. """Locator for use with the ExpatParser class.
  44. This uses a weak reference to the parser object to avoid creating
  45. a circular reference between the parser and the content handler.
  46. """
  47. def __init__(self, parser):
  48. self._ref = _mkproxy(parser)
  49. def getColumnNumber(self):
  50. parser = self._ref
  51. if parser._parser is None:
  52. return None
  53. return parser._parser.ErrorColumnNumber
  54. def getLineNumber(self):
  55. parser = self._ref
  56. if parser._parser is None:
  57. return 1
  58. return parser._parser.ErrorLineNumber
  59. def getPublicId(self):
  60. parser = self._ref
  61. if parser is None:
  62. return None
  63. return parser._source.getPublicId()
  64. def getSystemId(self):
  65. parser = self._ref
  66. if parser is None:
  67. return None
  68. return parser._source.getSystemId()
  69. # --- ExpatParser
  70. class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
  71. """SAX driver for the pyexpat C module."""
  72. def __init__(self, namespaceHandling=0, bufsize=2**16-20):
  73. xmlreader.IncrementalParser.__init__(self, bufsize)
  74. self._source = xmlreader.InputSource()
  75. self._parser = None
  76. self._namespaces = namespaceHandling
  77. self._lex_handler_prop = None
  78. self._parsing = 0
  79. self._entity_stack = []
  80. self._external_ges = 1
  81. self._interning = None
  82. # XMLReader methods
  83. def parse(self, source):
  84. "Parse an XML document from a URL or an InputSource."
  85. source = saxutils.prepare_input_source(source)
  86. self._source = source
  87. self.reset()
  88. self._cont_handler.setDocumentLocator(ExpatLocator(self))
  89. xmlreader.IncrementalParser.parse(self, source)
  90. def prepareParser(self, source):
  91. if source.getSystemId() is not None:
  92. base = source.getSystemId()
  93. if isinstance(base, unicode):
  94. base = base.encode('utf-8')
  95. self._parser.SetBase(base)
  96. # Redefined setContentHandler to allow changing handlers during parsing
  97. def setContentHandler(self, handler):
  98. xmlreader.IncrementalParser.setContentHandler(self, handler)
  99. if self._parsing:
  100. self._reset_cont_handler()
  101. def getFeature(self, name):
  102. if name == feature_namespaces:
  103. return self._namespaces
  104. elif name == feature_string_interning:
  105. return self._interning is not None
  106. elif name in (feature_validation, feature_external_pes,
  107. feature_namespace_prefixes):
  108. return 0
  109. elif name == feature_external_ges:
  110. return self._external_ges
  111. raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
  112. def setFeature(self, name, state):
  113. if self._parsing:
  114. raise SAXNotSupportedException("Cannot set features while parsing")
  115. if name == feature_namespaces:
  116. self._namespaces = state
  117. elif name == feature_external_ges:
  118. self._external_ges = state
  119. elif name == feature_string_interning:
  120. if state:
  121. if self._interning is None:
  122. self._interning = {}
  123. else:
  124. self._interning = None
  125. elif name == feature_validation:
  126. if state:
  127. raise SAXNotSupportedException(
  128. "expat does not support validation")
  129. elif name == feature_external_pes:
  130. if state:
  131. raise SAXNotSupportedException(
  132. "expat does not read external parameter entities")
  133. elif name == feature_namespace_prefixes:
  134. if state:
  135. raise SAXNotSupportedException(
  136. "expat does not report namespace prefixes")
  137. else:
  138. raise SAXNotRecognizedException(
  139. "Feature '%s' not recognized" % name)
  140. def getProperty(self, name):
  141. if name == handler.property_lexical_handler:
  142. return self._lex_handler_prop
  143. elif name == property_interning_dict:
  144. return self._interning
  145. elif name == property_xml_string:
  146. if self._parser:
  147. if hasattr(self._parser, "GetInputContext"):
  148. return self._parser.GetInputContext()
  149. else:
  150. raise SAXNotRecognizedException(
  151. "This version of expat does not support getting"
  152. " the XML string")
  153. else:
  154. raise SAXNotSupportedException(
  155. "XML string cannot be returned when not parsing")
  156. raise SAXNotRecognizedException("Property '%s' not recognized" % name)
  157. def setProperty(self, name, value):
  158. if name == handler.property_lexical_handler:
  159. self._lex_handler_prop = value
  160. if self._parsing:
  161. self._reset_lex_handler_prop()
  162. elif name == property_interning_dict:
  163. self._interning = value
  164. elif name == property_xml_string:
  165. raise SAXNotSupportedException("Property '%s' cannot be set" %
  166. name)
  167. else:
  168. raise SAXNotRecognizedException("Property '%s' not recognized" %
  169. name)
  170. # IncrementalParser methods
  171. def feed(self, data, isFinal = 0):
  172. if not self._parsing:
  173. self.reset()
  174. self._parsing = 1
  175. self._cont_handler.startDocument()
  176. try:
  177. # The isFinal parameter is internal to the expat reader.
  178. # If it is set to true, expat will check validity of the entire
  179. # document. When feeding chunks, they are not normally final -
  180. # except when invoked from close.
  181. self._parser.Parse(data, isFinal)
  182. except expat.error, e:
  183. exc = SAXParseException(expat.ErrorString(e.code), e, self)
  184. # FIXME: when to invoke error()?
  185. self._err_handler.fatalError(exc)
  186. def close(self):
  187. if (self._entity_stack or self._parser is None or
  188. isinstance(self._parser, _ClosedParser)):
  189. # If we are completing an external entity, do nothing here
  190. return
  191. try:
  192. self.feed("", isFinal = 1)
  193. self._cont_handler.endDocument()
  194. self._parsing = 0
  195. # break cycle created by expat handlers pointing to our methods
  196. self._parser = None
  197. finally:
  198. self._parsing = 0
  199. if self._parser is not None:
  200. # Keep ErrorColumnNumber and ErrorLineNumber after closing.
  201. parser = _ClosedParser()
  202. parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
  203. parser.ErrorLineNumber = self._parser.ErrorLineNumber
  204. self._parser = parser
  205. def _reset_cont_handler(self):
  206. self._parser.ProcessingInstructionHandler = \
  207. self._cont_handler.processingInstruction
  208. self._parser.CharacterDataHandler = self._cont_handler.characters
  209. def _reset_lex_handler_prop(self):
  210. lex = self._lex_handler_prop
  211. parser = self._parser
  212. if lex is None:
  213. parser.CommentHandler = None
  214. parser.StartCdataSectionHandler = None
  215. parser.EndCdataSectionHandler = None
  216. parser.StartDoctypeDeclHandler = None
  217. parser.EndDoctypeDeclHandler = None
  218. else:
  219. parser.CommentHandler = lex.comment
  220. parser.StartCdataSectionHandler = lex.startCDATA
  221. parser.EndCdataSectionHandler = lex.endCDATA
  222. parser.StartDoctypeDeclHandler = self.start_doctype_decl
  223. parser.EndDoctypeDeclHandler = lex.endDTD
  224. def reset(self):
  225. if self._namespaces:
  226. self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
  227. intern=self._interning)
  228. self._parser.namespace_prefixes = 1
  229. self._parser.StartElementHandler = self.start_element_ns
  230. self._parser.EndElementHandler = self.end_element_ns
  231. else:
  232. self._parser = expat.ParserCreate(self._source.getEncoding(),
  233. intern = self._interning)
  234. self._parser.StartElementHandler = self.start_element
  235. self._parser.EndElementHandler = self.end_element
  236. self._reset_cont_handler()
  237. self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
  238. self._parser.NotationDeclHandler = self.notation_decl
  239. self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
  240. self._parser.EndNamespaceDeclHandler = self.end_namespace_decl
  241. self._decl_handler_prop = None
  242. if self._lex_handler_prop:
  243. self._reset_lex_handler_prop()
  244. # self._parser.DefaultHandler =
  245. # self._parser.DefaultHandlerExpand =
  246. # self._parser.NotStandaloneHandler =
  247. self._parser.ExternalEntityRefHandler = self.external_entity_ref
  248. try:
  249. self._parser.SkippedEntityHandler = self.skipped_entity_handler
  250. except AttributeError:
  251. # This pyexpat does not support SkippedEntity
  252. pass
  253. self._parser.SetParamEntityParsing(
  254. expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
  255. self._parsing = 0
  256. self._entity_stack = []
  257. # Locator methods
  258. def getColumnNumber(self):
  259. if self._parser is None:
  260. return None
  261. return self._parser.ErrorColumnNumber
  262. def getLineNumber(self):
  263. if self._parser is None:
  264. return 1
  265. return self._parser.ErrorLineNumber
  266. def getPublicId(self):
  267. return self._source.getPublicId()
  268. def getSystemId(self):
  269. return self._source.getSystemId()
  270. # event handlers
  271. def start_element(self, name, attrs):
  272. self._cont_handler.startElement(name, AttributesImpl(attrs))
  273. def end_element(self, name):
  274. self._cont_handler.endElement(name)
  275. def start_element_ns(self, name, attrs):
  276. pair = name.split()
  277. if len(pair) == 1:
  278. # no namespace
  279. pair = (None, name)
  280. elif len(pair) == 3:
  281. pair = pair[0], pair[1]
  282. else:
  283. # default namespace
  284. pair = tuple(pair)
  285. newattrs = {}
  286. qnames = {}
  287. for (aname, value) in attrs.items():
  288. parts = aname.split()
  289. length = len(parts)
  290. if length == 1:
  291. # no namespace
  292. qname = aname
  293. apair = (None, aname)
  294. elif length == 3:
  295. qname = "%s:%s" % (parts[2], parts[1])
  296. apair = parts[0], parts[1]
  297. else:
  298. # default namespace
  299. qname = parts[1]
  300. apair = tuple(parts)
  301. newattrs[apair] = value
  302. qnames[apair] = qname
  303. self._cont_handler.startElementNS(pair, None,
  304. AttributesNSImpl(newattrs, qnames))
  305. def end_element_ns(self, name):
  306. pair = name.split()
  307. if len(pair) == 1:
  308. pair = (None, name)
  309. elif len(pair) == 3:
  310. pair = pair[0], pair[1]
  311. else:
  312. pair = tuple(pair)
  313. self._cont_handler.endElementNS(pair, None)
  314. # this is not used (call directly to ContentHandler)
  315. def processing_instruction(self, target, data):
  316. self._cont_handler.processingInstruction(target, data)
  317. # this is not used (call directly to ContentHandler)
  318. def character_data(self, data):
  319. self._cont_handler.characters(data)
  320. def start_namespace_decl(self, prefix, uri):
  321. self._cont_handler.startPrefixMapping(prefix, uri)
  322. def end_namespace_decl(self, prefix):
  323. self._cont_handler.endPrefixMapping(prefix)
  324. def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
  325. self._lex_handler_prop.startDTD(name, pubid, sysid)
  326. def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
  327. self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)
  328. def notation_decl(self, name, base, sysid, pubid):
  329. self._dtd_handler.notationDecl(name, pubid, sysid)
  330. def external_entity_ref(self, context, base, sysid, pubid):
  331. if not self._external_ges:
  332. return 1
  333. source = self._ent_handler.resolveEntity(pubid, sysid)
  334. source = saxutils.prepare_input_source(source,
  335. self._source.getSystemId() or
  336. "")
  337. self._entity_stack.append((self._parser, self._source))
  338. self._parser = self._parser.ExternalEntityParserCreate(context)
  339. self._source = source
  340. try:
  341. xmlreader.IncrementalParser.parse(self, source)
  342. except:
  343. return 0 # FIXME: save error info here?
  344. (self._parser, self._source) = self._entity_stack[-1]
  345. del self._entity_stack[-1]
  346. return 1
  347. def skipped_entity_handler(self, name, is_pe):
  348. if is_pe:
  349. # The SAX spec requires to report skipped PEs with a '%'
  350. name = '%'+name
  351. self._cont_handler.skippedEntity(name)
  352. # ---
  353. def create_parser(*args, **kwargs):
  354. return ExpatParser(*args, **kwargs)
  355. # ---
  356. if __name__ == "__main__":
  357. import xml.sax.saxutils
  358. p = create_parser()
  359. p.setContentHandler(xml.sax.saxutils.XMLGenerator())
  360. p.setErrorHandler(xml.sax.ErrorHandler())
  361. p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")