# test_codeccallbacks.py -- tests for codec error-handling callbacks
  1. import test.test_support, unittest
  2. import sys, codecs, htmlentitydefs, unicodedata
  3. class PosReturn:
  4. # this can be used for configurable callbacks
  5. def __init__(self):
  6. self.pos = 0
  7. def handle(self, exc):
  8. oldpos = self.pos
  9. realpos = oldpos
  10. if realpos<0:
  11. realpos = len(exc.object) + realpos
  12. # if we don't advance this time, terminate on the next call
  13. # otherwise we'd get an endless loop
  14. if realpos <= exc.start:
  15. self.pos = len(exc.object)
  16. return (u"<?>", oldpos)
  17. # A UnicodeEncodeError object with a bad start attribute
  18. class BadStartUnicodeEncodeError(UnicodeEncodeError):
  19. def __init__(self):
  20. UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
  21. self.start = []
  22. # A UnicodeEncodeError object with a bad object attribute
  23. class BadObjectUnicodeEncodeError(UnicodeEncodeError):
  24. def __init__(self):
  25. UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
  26. self.object = []
  27. # A UnicodeDecodeError object without an end attribute
  28. class NoEndUnicodeDecodeError(UnicodeDecodeError):
  29. def __init__(self):
  30. UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
  31. del self.end
  32. # A UnicodeDecodeError object with a bad object attribute
  33. class BadObjectUnicodeDecodeError(UnicodeDecodeError):
  34. def __init__(self):
  35. UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
  36. self.object = []
  37. # A UnicodeTranslateError object without a start attribute
  38. class NoStartUnicodeTranslateError(UnicodeTranslateError):
  39. def __init__(self):
  40. UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
  41. del self.start
  42. # A UnicodeTranslateError object without an end attribute
  43. class NoEndUnicodeTranslateError(UnicodeTranslateError):
  44. def __init__(self):
  45. UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
  46. del self.end
  47. # A UnicodeTranslateError object without an object attribute
  48. class NoObjectUnicodeTranslateError(UnicodeTranslateError):
  49. def __init__(self):
  50. UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
  51. del self.object
  52. class CodecCallbackTest(unittest.TestCase):
  53. def test_xmlcharrefreplace(self):
  54. # replace unencodable characters which numeric character entities.
  55. # For ascii, latin-1 and charmaps this is completely implemented
  56. # in C and should be reasonably fast.
  57. s = u"\u30b9\u30d1\u30e2 \xe4nd egg\u0161"
  58. self.assertEqual(
  59. s.encode("ascii", "xmlcharrefreplace"),
  60. "&#12473;&#12497;&#12514; &#228;nd egg&#353;"
  61. )
  62. self.assertEqual(
  63. s.encode("latin-1", "xmlcharrefreplace"),
  64. "&#12473;&#12497;&#12514; \xe4nd egg&#353;"
  65. )
  66. self.assertEqual(
  67. s.encode("iso-8859-15", "xmlcharrefreplace"),
  68. "&#12473;&#12497;&#12514; \xe4nd egg\xa8"
  69. )
  70. def test_xmlcharrefreplace_with_surrogates(self):
  71. tests = [(u'\U0001f49d', '&#128157;'),
  72. (u'\ud83d', '&#55357;'),
  73. (u'\udc9d', '&#56477;'),
  74. ]
  75. if u'\ud83d\udc9d' != u'\U0001f49d':
  76. tests += [(u'\ud83d\udc9d', '&#55357;&#56477;')]
  77. for encoding in ['ascii', 'latin1', 'iso-8859-15']:
  78. for s, exp in tests:
  79. self.assertEqual(s.encode(encoding, 'xmlcharrefreplace'),
  80. exp, msg='%r.encode(%r)' % (s, encoding))
  81. self.assertEqual((s+'X').encode(encoding, 'xmlcharrefreplace'),
  82. exp+'X',
  83. msg='%r.encode(%r)' % (s + 'X', encoding))
  84. def test_xmlcharnamereplace(self):
  85. # This time use a named character entity for unencodable
  86. # characters, if one is available.
  87. def xmlcharnamereplace(exc):
  88. if not isinstance(exc, UnicodeEncodeError):
  89. raise TypeError("don't know how to handle %r" % exc)
  90. l = []
  91. for c in exc.object[exc.start:exc.end]:
  92. try:
  93. l.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)])
  94. except KeyError:
  95. l.append(u"&#%d;" % ord(c))
  96. return (u"".join(l), exc.end)
  97. codecs.register_error(
  98. "test.xmlcharnamereplace", xmlcharnamereplace)
  99. sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
  100. sout = "&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
  101. self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
  102. sout = "\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
  103. self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
  104. sout = "\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
  105. self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
  106. def test_uninamereplace(self):
  107. # We're using the names from the unicode database this time,
  108. # and we're doing "syntax highlighting" here, i.e. we include
  109. # the replaced text in ANSI escape sequences. For this it is
  110. # useful that the error handler is not called for every single
  111. # unencodable character, but for a complete sequence of
  112. # unencodable characters, otherwise we would output many
  113. # unnecessary escape sequences.
  114. def uninamereplace(exc):
  115. if not isinstance(exc, UnicodeEncodeError):
  116. raise TypeError("don't know how to handle %r" % exc)
  117. l = []
  118. for c in exc.object[exc.start:exc.end]:
  119. l.append(unicodedata.name(c, u"0x%x" % ord(c)))
  120. return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end)
  121. codecs.register_error(
  122. "test.uninamereplace", uninamereplace)
  123. sin = u"\xac\u1234\u20ac\u8000"
  124. sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
  125. self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
  126. sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
  127. self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
  128. sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
  129. self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
  130. def test_backslashescape(self):
  131. # Does the same as the "unicode-escape" encoding, but with different
  132. # base encodings.
  133. sin = u"a\xac\u1234\u20ac\u8000"
  134. if sys.maxunicode > 0xffff:
  135. sin += unichr(sys.maxunicode)
  136. sout = "a\\xac\\u1234\\u20ac\\u8000"
  137. if sys.maxunicode > 0xffff:
  138. sout += "\\U%08x" % sys.maxunicode
  139. self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
  140. sout = "a\xac\\u1234\\u20ac\\u8000"
  141. if sys.maxunicode > 0xffff:
  142. sout += "\\U%08x" % sys.maxunicode
  143. self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
  144. sout = "a\xac\\u1234\xa4\\u8000"
  145. if sys.maxunicode > 0xffff:
  146. sout += "\\U%08x" % sys.maxunicode
  147. self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
  148. def test_decoding_callbacks(self):
  149. # This is a test for a decoding callback handler
  150. # that allows the decoding of the invalid sequence
  151. # "\xc0\x80" and returns "\x00" instead of raising an error.
  152. # All other illegal sequences will be handled strictly.
  153. def relaxedutf8(exc):
  154. if not isinstance(exc, UnicodeDecodeError):
  155. raise TypeError("don't know how to handle %r" % exc)
  156. if exc.object[exc.start:exc.start+2] == "\xc0\x80":
  157. return (u"\x00", exc.start+2) # retry after two bytes
  158. else:
  159. raise exc
  160. codecs.register_error("test.relaxedutf8", relaxedutf8)
  161. # all the "\xc0\x80" will be decoded to "\x00"
  162. sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
  163. sout = u"a\x00b\x00c\xfc\x00\x00"
  164. self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
  165. # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
  166. sin = "\xc0\x80\xc0\x81"
  167. self.assertRaises(UnicodeDecodeError, sin.decode,
  168. "utf-8", "test.relaxedutf8")
  169. def test_charmapencode(self):
  170. # For charmap encodings the replacement string will be
  171. # mapped through the encoding again. This means, that
  172. # to be able to use e.g. the "replace" handler, the
  173. # charmap has to have a mapping for "?".
  174. charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
  175. sin = u"abc"
  176. sout = "AABBCC"
  177. self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
  178. sin = u"abcA"
  179. self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
  180. charmap[ord("?")] = "XYZ"
  181. sin = u"abcDEF"
  182. sout = "AABBCCXYZXYZXYZ"
  183. self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
  184. charmap[ord("?")] = u"XYZ"
  185. self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
  186. charmap[ord("?")] = u"XYZ"
  187. self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
  188. def test_decodeunicodeinternal(self):
  189. self.assertRaises(
  190. UnicodeDecodeError,
  191. "\x00\x00\x00\x00\x00".decode,
  192. "unicode-internal",
  193. )
  194. if sys.maxunicode > 0xffff:
  195. def handler_unicodeinternal(exc):
  196. if not isinstance(exc, UnicodeDecodeError):
  197. raise TypeError("don't know how to handle %r" % exc)
  198. return (u"\x01", 1)
  199. self.assertEqual(
  200. "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
  201. u"\u0000"
  202. )
  203. self.assertEqual(
  204. "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
  205. u"\u0000\ufffd"
  206. )
  207. codecs.register_error("test.hui", handler_unicodeinternal)
  208. self.assertEqual(
  209. "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
  210. u"\u0000\u0001\u0000"
  211. )
  212. def test_callbacks(self):
  213. def handler1(exc):
  214. if not isinstance(exc, UnicodeEncodeError) \
  215. and not isinstance(exc, UnicodeDecodeError):
  216. raise TypeError("don't know how to handle %r" % exc)
  217. l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
  218. return (u"[%s]" % u"".join(l), exc.end)
  219. codecs.register_error("test.handler1", handler1)
  220. def handler2(exc):
  221. if not isinstance(exc, UnicodeDecodeError):
  222. raise TypeError("don't know how to handle %r" % exc)
  223. l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
  224. return (u"[%s]" % u"".join(l), exc.end+1) # skip one character
  225. codecs.register_error("test.handler2", handler2)
  226. s = "\x00\x81\x7f\x80\xff"
  227. self.assertEqual(
  228. s.decode("ascii", "test.handler1"),
  229. u"\x00[<129>]\x7f[<128>][<255>]"
  230. )
  231. self.assertEqual(
  232. s.decode("ascii", "test.handler2"),
  233. u"\x00[<129>][<128>]"
  234. )
  235. self.assertEqual(
  236. "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
  237. u"\u3042[<92><117><51>]xxx"
  238. )
  239. self.assertEqual(
  240. "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
  241. u"\u3042[<92><117><51>]xx"
  242. )
  243. self.assertEqual(
  244. codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
  245. u"z[<98>][<99>]"
  246. )
  247. self.assertEqual(
  248. u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
  249. u"g[<252><223>]rk"
  250. )
  251. self.assertEqual(
  252. u"g\xfc\xdf".encode("ascii", "test.handler1"),
  253. u"g[<252><223>]"
  254. )
  255. def test_longstrings(self):
  256. # test long strings to check for memory overflow problems
  257. errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
  258. "backslashreplace"]
  259. # register the handlers under different names,
  260. # to prevent the codec from recognizing the name
  261. for err in errors:
  262. codecs.register_error("test." + err, codecs.lookup_error(err))
  263. l = 1000
  264. errors += [ "test." + err for err in errors ]
  265. for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
  266. for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
  267. "utf-8", "utf-7", "utf-16", "utf-32"):
  268. for err in errors:
  269. try:
  270. uni.encode(enc, err)
  271. except UnicodeError:
  272. pass
  273. def check_exceptionobjectargs(self, exctype, args, msg):
  274. # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
  275. # check with one missing argument
  276. self.assertRaises(TypeError, exctype, *args[:-1])
  277. # check with one argument too much
  278. self.assertRaises(TypeError, exctype, *(args + ["too much"]))
  279. # check with one argument of the wrong type
  280. wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
  281. for i in xrange(len(args)):
  282. for wrongarg in wrongargs:
  283. if type(wrongarg) is type(args[i]):
  284. continue
  285. # build argument array
  286. callargs = []
  287. for j in xrange(len(args)):
  288. if i==j:
  289. callargs.append(wrongarg)
  290. else:
  291. callargs.append(args[i])
  292. self.assertRaises(TypeError, exctype, *callargs)
  293. # check with the correct number and type of arguments
  294. exc = exctype(*args)
  295. self.assertEqual(str(exc), msg)
  296. def test_unicodeencodeerror(self):
  297. self.check_exceptionobjectargs(
  298. UnicodeEncodeError,
  299. ["ascii", u"g\xfcrk", 1, 2, "ouch"],
  300. "'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
  301. )
  302. self.check_exceptionobjectargs(
  303. UnicodeEncodeError,
  304. ["ascii", u"g\xfcrk", 1, 4, "ouch"],
  305. "'ascii' codec can't encode characters in position 1-3: ouch"
  306. )
  307. self.check_exceptionobjectargs(
  308. UnicodeEncodeError,
  309. ["ascii", u"\xfcx", 0, 1, "ouch"],
  310. "'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
  311. )
  312. self.check_exceptionobjectargs(
  313. UnicodeEncodeError,
  314. ["ascii", u"\u0100x", 0, 1, "ouch"],
  315. "'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
  316. )
  317. self.check_exceptionobjectargs(
  318. UnicodeEncodeError,
  319. ["ascii", u"\uffffx", 0, 1, "ouch"],
  320. "'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
  321. )
  322. if sys.maxunicode > 0xffff:
  323. self.check_exceptionobjectargs(
  324. UnicodeEncodeError,
  325. ["ascii", u"\U00010000x", 0, 1, "ouch"],
  326. "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
  327. )
  328. def test_unicodedecodeerror(self):
  329. self.check_exceptionobjectargs(
  330. UnicodeDecodeError,
  331. ["ascii", "g\xfcrk", 1, 2, "ouch"],
  332. "'ascii' codec can't decode byte 0xfc in position 1: ouch"
  333. )
  334. self.check_exceptionobjectargs(
  335. UnicodeDecodeError,
  336. ["ascii", "g\xfcrk", 1, 3, "ouch"],
  337. "'ascii' codec can't decode bytes in position 1-2: ouch"
  338. )
  339. def test_unicodetranslateerror(self):
  340. self.check_exceptionobjectargs(
  341. UnicodeTranslateError,
  342. [u"g\xfcrk", 1, 2, "ouch"],
  343. "can't translate character u'\\xfc' in position 1: ouch"
  344. )
  345. self.check_exceptionobjectargs(
  346. UnicodeTranslateError,
  347. [u"g\u0100rk", 1, 2, "ouch"],
  348. "can't translate character u'\\u0100' in position 1: ouch"
  349. )
  350. self.check_exceptionobjectargs(
  351. UnicodeTranslateError,
  352. [u"g\uffffrk", 1, 2, "ouch"],
  353. "can't translate character u'\\uffff' in position 1: ouch"
  354. )
  355. if sys.maxunicode > 0xffff:
  356. self.check_exceptionobjectargs(
  357. UnicodeTranslateError,
  358. [u"g\U00010000rk", 1, 2, "ouch"],
  359. "can't translate character u'\\U00010000' in position 1: ouch"
  360. )
  361. self.check_exceptionobjectargs(
  362. UnicodeTranslateError,
  363. [u"g\xfcrk", 1, 3, "ouch"],
  364. "can't translate characters in position 1-2: ouch"
  365. )
  366. def test_badandgoodstrictexceptions(self):
  367. # "strict" complains about a non-exception passed in
  368. self.assertRaises(
  369. TypeError,
  370. codecs.strict_errors,
  371. 42
  372. )
  373. # "strict" complains about the wrong exception type
  374. self.assertRaises(
  375. Exception,
  376. codecs.strict_errors,
  377. Exception("ouch")
  378. )
  379. # If the correct exception is passed in, "strict" raises it
  380. self.assertRaises(
  381. UnicodeEncodeError,
  382. codecs.strict_errors,
  383. UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
  384. )
  385. self.assertRaises(
  386. UnicodeDecodeError,
  387. codecs.strict_errors,
  388. UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
  389. )
  390. self.assertRaises(
  391. UnicodeTranslateError,
  392. codecs.strict_errors,
  393. UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
  394. )
  395. def test_badandgoodignoreexceptions(self):
  396. # "ignore" complains about a non-exception passed in
  397. self.assertRaises(
  398. TypeError,
  399. codecs.ignore_errors,
  400. 42
  401. )
  402. # "ignore" complains about the wrong exception type
  403. self.assertRaises(
  404. TypeError,
  405. codecs.ignore_errors,
  406. UnicodeError("ouch")
  407. )
  408. # If the correct exception is passed in, "ignore" returns an empty replacement
  409. self.assertEqual(
  410. codecs.ignore_errors(
  411. UnicodeEncodeError("ascii", u"a\u3042b", 1, 2, "ouch")),
  412. (u"", 2)
  413. )
  414. self.assertEqual(
  415. codecs.ignore_errors(
  416. UnicodeDecodeError("ascii", "a\xffb", 1, 2, "ouch")),
  417. (u"", 2)
  418. )
  419. self.assertEqual(
  420. codecs.ignore_errors(
  421. UnicodeTranslateError(u"a\u3042b", 1, 2, "ouch")),
  422. (u"", 2)
  423. )
  424. def test_badandgoodreplaceexceptions(self):
  425. # "replace" complains about a non-exception passed in
  426. self.assertRaises(
  427. TypeError,
  428. codecs.replace_errors,
  429. 42
  430. )
  431. # "replace" complains about the wrong exception type
  432. self.assertRaises(
  433. TypeError,
  434. codecs.replace_errors,
  435. UnicodeError("ouch")
  436. )
  437. self.assertRaises(
  438. TypeError,
  439. codecs.replace_errors,
  440. BadObjectUnicodeEncodeError()
  441. )
  442. self.assertRaises(
  443. TypeError,
  444. codecs.replace_errors,
  445. BadObjectUnicodeDecodeError()
  446. )
  447. # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
  448. self.assertEqual(
  449. codecs.replace_errors(
  450. UnicodeEncodeError("ascii", u"a\u3042b", 1, 2, "ouch")),
  451. (u"?", 2)
  452. )
  453. self.assertEqual(
  454. codecs.replace_errors(
  455. UnicodeDecodeError("ascii", "a\xffb", 1, 2, "ouch")),
  456. (u"\ufffd", 2)
  457. )
  458. self.assertEqual(
  459. codecs.replace_errors(
  460. UnicodeTranslateError(u"a\u3042b", 1, 2, "ouch")),
  461. (u"\ufffd", 2)
  462. )
  463. def test_badandgoodxmlcharrefreplaceexceptions(self):
  464. # "xmlcharrefreplace" complains about a non-exception passed in
  465. self.assertRaises(
  466. TypeError,
  467. codecs.xmlcharrefreplace_errors,
  468. 42
  469. )
  470. # "xmlcharrefreplace" complains about the wrong exception types
  471. self.assertRaises(
  472. TypeError,
  473. codecs.xmlcharrefreplace_errors,
  474. UnicodeError("ouch")
  475. )
  476. # "xmlcharrefreplace" can only be used for encoding
  477. self.assertRaises(
  478. TypeError,
  479. codecs.xmlcharrefreplace_errors,
  480. UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
  481. )
  482. self.assertRaises(
  483. TypeError,
  484. codecs.xmlcharrefreplace_errors,
  485. UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
  486. )
  487. # Use the correct exception
  488. cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000)
  489. cs += (0xdfff, 0xd800)
  490. s = u"".join(unichr(c) for c in cs)
  491. s += u"\U0001869f\U000186a0\U000f423f\U000f4240"
  492. cs += (99999, 100000, 999999, 1000000)
  493. self.assertEqual(
  494. codecs.xmlcharrefreplace_errors(
  495. UnicodeEncodeError("ascii", u"a" + s + u"b",
  496. 1, 1 + len(s), "ouch")
  497. ),
  498. (u"".join(u"&#%d;" % c for c in cs), 1 + len(s))
  499. )
  500. def test_badandgoodbackslashreplaceexceptions(self):
  501. # "backslashreplace" complains about a non-exception passed in
  502. self.assertRaises(
  503. TypeError,
  504. codecs.backslashreplace_errors,
  505. 42
  506. )
  507. # "backslashreplace" complains about the wrong exception types
  508. self.assertRaises(
  509. TypeError,
  510. codecs.backslashreplace_errors,
  511. UnicodeError("ouch")
  512. )
  513. # "backslashreplace" can only be used for encoding
  514. self.assertRaises(
  515. TypeError,
  516. codecs.backslashreplace_errors,
  517. UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
  518. )
  519. self.assertRaises(
  520. TypeError,
  521. codecs.backslashreplace_errors,
  522. UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
  523. )
  524. # Use the correct exception
  525. tests = [
  526. (u"\u3042", u"\\u3042"),
  527. (u"\n", u"\\x0a"),
  528. (u"a", u"\\x61"),
  529. (u"\x00", u"\\x00"),
  530. (u"\xff", u"\\xff"),
  531. (u"\u0100", u"\\u0100"),
  532. (u"\uffff", u"\\uffff"),
  533. # Lone surrogates
  534. (u"\ud800", u"\\ud800"),
  535. (u"\udfff", u"\\udfff"),
  536. ]
  537. if sys.maxunicode > 0xffff:
  538. tests += [
  539. (u"\U00010000", u"\\U00010000"),
  540. (u"\U0010ffff", u"\\U0010ffff"),
  541. ]
  542. else:
  543. tests += [
  544. (u"\U00010000", u"\\ud800\\udc00"),
  545. (u"\U0010ffff", u"\\udbff\\udfff"),
  546. ]
  547. for s, r in tests:
  548. self.assertEqual(
  549. codecs.backslashreplace_errors(
  550. UnicodeEncodeError("ascii", u"a" + s + u"b",
  551. 1, 1 + len(s), "ouch")),
  552. (r, 1 + len(s))
  553. )
  554. def test_badhandlerresults(self):
  555. results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
  556. encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
  557. for res in results:
  558. codecs.register_error("test.badhandler", lambda x: res)
  559. for enc in encs:
  560. self.assertRaises(
  561. TypeError,
  562. u"\u3042".encode,
  563. enc,
  564. "test.badhandler"
  565. )
  566. for (enc, bytes) in (
  567. ("ascii", "\xff"),
  568. ("utf-8", "\xff"),
  569. ("utf-7", "+x-"),
  570. ("unicode-internal", "\x00"),
  571. ):
  572. self.assertRaises(
  573. TypeError,
  574. bytes.decode,
  575. enc,
  576. "test.badhandler"
  577. )
  578. def test_lookup(self):
  579. self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
  580. self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
  581. self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
  582. self.assertEqual(
  583. codecs.xmlcharrefreplace_errors,
  584. codecs.lookup_error("xmlcharrefreplace")
  585. )
  586. self.assertEqual(
  587. codecs.backslashreplace_errors,
  588. codecs.lookup_error("backslashreplace")
  589. )
  590. def test_unencodablereplacement(self):
  591. def unencrepl(exc):
  592. if isinstance(exc, UnicodeEncodeError):
  593. return (u"\u4242", exc.end)
  594. else:
  595. raise TypeError("don't know how to handle %r" % exc)
  596. codecs.register_error("test.unencreplhandler", unencrepl)
  597. for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
  598. self.assertRaises(
  599. UnicodeEncodeError,
  600. u"\u4242".encode,
  601. enc,
  602. "test.unencreplhandler"
  603. )
  604. def test_badregistercall(self):
  605. # enhance coverage of:
  606. # Modules/_codecsmodule.c::register_error()
  607. # Python/codecs.c::PyCodec_RegisterError()
  608. self.assertRaises(TypeError, codecs.register_error, 42)
  609. self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
  610. def test_badlookupcall(self):
  611. # enhance coverage of:
  612. # Modules/_codecsmodule.c::lookup_error()
  613. self.assertRaises(TypeError, codecs.lookup_error)
  614. def test_unknownhandler(self):
  615. # enhance coverage of:
  616. # Modules/_codecsmodule.c::lookup_error()
  617. self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
  618. def test_xmlcharrefvalues(self):
  619. # enhance coverage of:
  620. # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
  621. # and inline implementations
  622. v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000)
  623. if sys.maxunicode>=100000:
  624. v += (100000, 500000, 1000000)
  625. s = u"".join([unichr(x) for x in v])
  626. codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
  627. for enc in ("ascii", "iso-8859-15"):
  628. for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
  629. s.encode(enc, err)
  630. def test_decodehelper(self):
  631. # enhance coverage of:
  632. # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
  633. # and callers
  634. self.assertRaises(LookupError, "\xff".decode, "ascii", "test.unknown")
  635. def baddecodereturn1(exc):
  636. return 42
  637. codecs.register_error("test.baddecodereturn1", baddecodereturn1)
  638. self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1")
  639. self.assertRaises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1")
  640. self.assertRaises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1")
  641. self.assertRaises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
  642. self.assertRaises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
  643. self.assertRaises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
  644. def baddecodereturn2(exc):
  645. return (u"?", None)
  646. codecs.register_error("test.baddecodereturn2", baddecodereturn2)
  647. self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2")
  648. handler = PosReturn()
  649. codecs.register_error("test.posreturn", handler.handle)
  650. # Valid negative position
  651. handler.pos = -1
  652. self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
  653. # Valid negative position
  654. handler.pos = -2
  655. self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?><?>")
  656. # Negative position out of bounds
  657. handler.pos = -3
  658. self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
  659. # Valid positive position
  660. handler.pos = 1
  661. self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
  662. # Largest valid positive position (one beyond end of input)
  663. handler.pos = 2
  664. self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>")
  665. # Invalid positive position
  666. handler.pos = 3
  667. self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
  668. # Restart at the "0"
  669. handler.pos = 6
  670. self.assertEqual("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0")
  671. class D(dict):
  672. def __getitem__(self, key):
  673. raise ValueError
  674. self.assertRaises(UnicodeError, codecs.charmap_decode, "\xff", "strict", {0xff: None})
  675. self.assertRaises(ValueError, codecs.charmap_decode, "\xff", "strict", D())
  676. self.assertRaises(TypeError, codecs.charmap_decode, "\xff", "strict", {0xff: 0x110000})
  677. def test_encodehelper(self):
  678. # enhance coverage of:
  679. # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
  680. # and callers
  681. self.assertRaises(LookupError, u"\xff".encode, "ascii", "test.unknown")
  682. def badencodereturn1(exc):
  683. return 42
  684. codecs.register_error("test.badencodereturn1", badencodereturn1)
  685. self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn1")
  686. def badencodereturn2(exc):
  687. return (u"?", None)
  688. codecs.register_error("test.badencodereturn2", badencodereturn2)
  689. self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2")
  690. handler = PosReturn()
  691. codecs.register_error("test.posreturn", handler.handle)
  692. # Valid negative position
  693. handler.pos = -1
  694. self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
  695. # Valid negative position
  696. handler.pos = -2
  697. self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")
  698. # Negative position out of bounds
  699. handler.pos = -3
  700. self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
  701. # Valid positive position
  702. handler.pos = 1
  703. self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
  704. # Largest valid positive position (one beyond end of input
  705. handler.pos = 2
  706. self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>")
  707. # Invalid positive position
  708. handler.pos = 3
  709. self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
  710. handler.pos = 0
  711. class D(dict):
  712. def __getitem__(self, key):
  713. raise ValueError
  714. for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
  715. self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
  716. self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
  717. self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})
  718. def test_translatehelper(self):
  719. # enhance coverage of:
  720. # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
  721. # and callers
  722. # (Unfortunately the errors argument is not directly accessible
  723. # from Python, so we can't test that much)
  724. class D(dict):
  725. def __getitem__(self, key):
  726. raise ValueError
  727. self.assertRaises(ValueError, u"\xff".translate, D())
  728. self.assertRaises(TypeError, u"\xff".translate, {0xff: sys.maxunicode+1})
  729. self.assertRaises(TypeError, u"\xff".translate, {0xff: ()})
  730. def test_bug828737(self):
  731. charmap = {
  732. ord("&"): u"&amp;",
  733. ord("<"): u"&lt;",
  734. ord(">"): u"&gt;",
  735. ord('"'): u"&quot;",
  736. }
  737. for n in (1, 10, 100, 1000):
  738. text = u'abc<def>ghi'*n
  739. text.translate(charmap)
  740. def test_fake_error_class(self):
  741. handlers = [
  742. codecs.strict_errors,
  743. codecs.ignore_errors,
  744. codecs.replace_errors,
  745. codecs.backslashreplace_errors,
  746. codecs.xmlcharrefreplace_errors,
  747. ]
  748. for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError:
  749. class FakeUnicodeError(str):
  750. __class__ = cls
  751. for handler in handlers:
  752. self.assertRaises(TypeError, handler, FakeUnicodeError())
  753. class FakeUnicodeError(Exception):
  754. __class__ = cls
  755. for handler in handlers:
  756. with self.assertRaises((TypeError, FakeUnicodeError)):
  757. handler(FakeUnicodeError())
  758. def test_main():
  759. test.test_support.run_unittest(CodecCallbackTest)
# Run the tests only when this file is executed as a script.
if __name__ == "__main__":
    test_main()