# test_codecs.py -- regression tests for the Python 2 `codecs` module
# (stream readers/writers, incremental codecs, and the UTF-7/8/16/32 codecs).
  1. from test import test_support
  2. import unittest
  3. import codecs
  4. import locale
  5. import sys, StringIO
  6. def coding_checker(self, coder):
  7. def check(input, expect):
  8. self.assertEqual(coder(input), (expect, len(input)))
  9. return check
  10. class Queue(object):
  11. """
  12. queue: write bytes at one end, read bytes from the other end
  13. """
  14. def __init__(self):
  15. self._buffer = ""
  16. def write(self, chars):
  17. self._buffer += chars
  18. def read(self, size=-1):
  19. if size<0:
  20. s = self._buffer
  21. self._buffer = ""
  22. return s
  23. else:
  24. s = self._buffer[:size]
  25. self._buffer = self._buffer[size:]
  26. return s
  27. class ReadTest(unittest.TestCase):
  28. def check_partial(self, input, partialresults):
  29. # get a StreamReader for the encoding and feed the bytestring version
  30. # of input to the reader byte by byte. Read everything available from
  31. # the StreamReader and check that the results equal the appropriate
  32. # entries from partialresults.
  33. q = Queue()
  34. r = codecs.getreader(self.encoding)(q)
  35. result = u""
  36. for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
  37. q.write(c)
  38. result += r.read()
  39. self.assertEqual(result, partialresult)
  40. # check that there's nothing left in the buffers
  41. self.assertEqual(r.read(), u"")
  42. self.assertEqual(r.bytebuffer, "")
  43. self.assertEqual(r.charbuffer, u"")
  44. # do the check again, this time using an incremental decoder
  45. d = codecs.getincrementaldecoder(self.encoding)()
  46. result = u""
  47. for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
  48. result += d.decode(c)
  49. self.assertEqual(result, partialresult)
  50. # check that there's nothing left in the buffers
  51. self.assertEqual(d.decode("", True), u"")
  52. self.assertEqual(d.buffer, "")
  53. # Check whether the reset method works properly
  54. d.reset()
  55. result = u""
  56. for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
  57. result += d.decode(c)
  58. self.assertEqual(result, partialresult)
  59. # check that there's nothing left in the buffers
  60. self.assertEqual(d.decode("", True), u"")
  61. self.assertEqual(d.buffer, "")
  62. # check iterdecode()
  63. encoded = input.encode(self.encoding)
  64. self.assertEqual(
  65. input,
  66. u"".join(codecs.iterdecode(encoded, self.encoding))
  67. )
  68. def test_readline(self):
  69. def getreader(input):
  70. stream = StringIO.StringIO(input.encode(self.encoding))
  71. return codecs.getreader(self.encoding)(stream)
  72. def readalllines(input, keepends=True, size=None):
  73. reader = getreader(input)
  74. lines = []
  75. while True:
  76. line = reader.readline(size=size, keepends=keepends)
  77. if not line:
  78. break
  79. lines.append(line)
  80. return "|".join(lines)
  81. s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
  82. sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
  83. sexpectednoends = u"foo|bar|baz|spam|eggs"
  84. self.assertEqual(readalllines(s, True), sexpected)
  85. self.assertEqual(readalllines(s, False), sexpectednoends)
  86. self.assertEqual(readalllines(s, True, 10), sexpected)
  87. self.assertEqual(readalllines(s, False, 10), sexpectednoends)
  88. lineends = ("\n", "\r\n", "\r", u"\u2028")
  89. # Test long lines (multiple calls to read() in readline())
  90. vw = []
  91. vwo = []
  92. for (i, lineend) in enumerate(lineends):
  93. vw.append((i*200+200)*u"\u3042" + lineend)
  94. vwo.append((i*200+200)*u"\u3042")
  95. self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
  96. self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))
  97. # Test lines where the first read might end with \r, so the
  98. # reader has to look ahead whether this is a lone \r or a \r\n
  99. for size in xrange(80):
  100. for lineend in lineends:
  101. s = 10*(size*u"a" + lineend + u"xxx\n")
  102. reader = getreader(s)
  103. for i in xrange(10):
  104. self.assertEqual(
  105. reader.readline(keepends=True),
  106. size*u"a" + lineend,
  107. )
  108. self.assertEqual(
  109. reader.readline(keepends=True),
  110. "xxx\n",
  111. )
  112. reader = getreader(s)
  113. for i in xrange(10):
  114. self.assertEqual(
  115. reader.readline(keepends=False),
  116. size*u"a",
  117. )
  118. self.assertEqual(
  119. reader.readline(keepends=False),
  120. "xxx",
  121. )
  122. def test_mixed_readline_and_read(self):
  123. lines = ["Humpty Dumpty sat on a wall,\n",
  124. "Humpty Dumpty had a great fall.\r\n",
  125. "All the king's horses and all the king's men\r",
  126. "Couldn't put Humpty together again."]
  127. data = ''.join(lines)
  128. def getreader():
  129. stream = StringIO.StringIO(data.encode(self.encoding))
  130. return codecs.getreader(self.encoding)(stream)
  131. # Issue #8260: Test readline() followed by read()
  132. f = getreader()
  133. self.assertEqual(f.readline(), lines[0])
  134. self.assertEqual(f.read(), ''.join(lines[1:]))
  135. self.assertEqual(f.read(), '')
  136. # Issue #16636: Test readline() followed by readlines()
  137. f = getreader()
  138. self.assertEqual(f.readline(), lines[0])
  139. self.assertEqual(f.readlines(), lines[1:])
  140. self.assertEqual(f.read(), '')
  141. # Test read() followed by read()
  142. f = getreader()
  143. self.assertEqual(f.read(size=40, chars=5), data[:5])
  144. self.assertEqual(f.read(), data[5:])
  145. self.assertEqual(f.read(), '')
  146. # Issue #12446: Test read() followed by readlines()
  147. f = getreader()
  148. self.assertEqual(f.read(size=40, chars=5), data[:5])
  149. self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
  150. self.assertEqual(f.read(), '')
  151. def test_bug1175396(self):
  152. s = [
  153. '<%!--===================================================\r\n',
  154. ' BLOG index page: show recent articles,\r\n',
  155. ' today\'s articles, or articles of a specific date.\r\n',
  156. '========================================================--%>\r\n',
  157. '<%@inputencoding="ISO-8859-1"%>\r\n',
  158. '<%@pagetemplate=TEMPLATE.y%>\r\n',
  159. '<%@import=import frog.util, frog%>\r\n',
  160. '<%@import=import frog.objects%>\r\n',
  161. '<%@import=from frog.storageerrors import StorageError%>\r\n',
  162. '<%\r\n',
  163. '\r\n',
  164. 'import logging\r\n',
  165. 'log=logging.getLogger("Snakelets.logger")\r\n',
  166. '\r\n',
  167. '\r\n',
  168. 'user=self.SessionCtx.user\r\n',
  169. 'storageEngine=self.SessionCtx.storageEngine\r\n',
  170. '\r\n',
  171. '\r\n',
  172. 'def readArticlesFromDate(date, count=None):\r\n',
  173. ' entryids=storageEngine.listBlogEntries(date)\r\n',
  174. ' entryids.reverse() # descending\r\n',
  175. ' if count:\r\n',
  176. ' entryids=entryids[:count]\r\n',
  177. ' try:\r\n',
  178. ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
  179. ' except StorageError,x:\r\n',
  180. ' log.error("Error loading articles: "+str(x))\r\n',
  181. ' self.abort("cannot load articles")\r\n',
  182. '\r\n',
  183. 'showdate=None\r\n',
  184. '\r\n',
  185. 'arg=self.Request.getArg()\r\n',
  186. 'if arg=="today":\r\n',
  187. ' #-------------------- TODAY\'S ARTICLES\r\n',
  188. ' self.write("<h2>Today\'s articles</h2>")\r\n',
  189. ' showdate = frog.util.isodatestr() \r\n',
  190. ' entries = readArticlesFromDate(showdate)\r\n',
  191. 'elif arg=="active":\r\n',
  192. ' #-------------------- ACTIVE ARTICLES redirect\r\n',
  193. ' self.Yredirect("active.y")\r\n',
  194. 'elif arg=="login":\r\n',
  195. ' #-------------------- LOGIN PAGE redirect\r\n',
  196. ' self.Yredirect("login.y")\r\n',
  197. 'elif arg=="date":\r\n',
  198. ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
  199. ' showdate = self.Request.getParameter("date")\r\n',
  200. ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
  201. ' entries = readArticlesFromDate(showdate)\r\n',
  202. 'else:\r\n',
  203. ' #-------------------- RECENT ARTICLES\r\n',
  204. ' self.write("<h2>Recent articles</h2>")\r\n',
  205. ' dates=storageEngine.listBlogEntryDates()\r\n',
  206. ' if dates:\r\n',
  207. ' entries=[]\r\n',
  208. ' SHOWAMOUNT=10\r\n',
  209. ' for showdate in dates:\r\n',
  210. ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
  211. ' if len(entries)>=SHOWAMOUNT:\r\n',
  212. ' break\r\n',
  213. ' \r\n',
  214. ]
  215. stream = StringIO.StringIO("".join(s).encode(self.encoding))
  216. reader = codecs.getreader(self.encoding)(stream)
  217. for (i, line) in enumerate(reader):
  218. self.assertEqual(line, s[i])
  219. def test_readlinequeue(self):
  220. q = Queue()
  221. writer = codecs.getwriter(self.encoding)(q)
  222. reader = codecs.getreader(self.encoding)(q)
  223. # No lineends
  224. writer.write(u"foo\r")
  225. self.assertEqual(reader.readline(keepends=False), u"foo")
  226. writer.write(u"\nbar\r")
  227. self.assertEqual(reader.readline(keepends=False), u"")
  228. self.assertEqual(reader.readline(keepends=False), u"bar")
  229. writer.write(u"baz")
  230. self.assertEqual(reader.readline(keepends=False), u"baz")
  231. self.assertEqual(reader.readline(keepends=False), u"")
  232. # Lineends
  233. writer.write(u"foo\r")
  234. self.assertEqual(reader.readline(keepends=True), u"foo\r")
  235. writer.write(u"\nbar\r")
  236. self.assertEqual(reader.readline(keepends=True), u"\n")
  237. self.assertEqual(reader.readline(keepends=True), u"bar\r")
  238. writer.write(u"baz")
  239. self.assertEqual(reader.readline(keepends=True), u"baz")
  240. self.assertEqual(reader.readline(keepends=True), u"")
  241. writer.write(u"foo\r\n")
  242. self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
  243. def test_bug1098990_a(self):
  244. s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
  245. s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
  246. s3 = u"next line.\r\n"
  247. s = (s1+s2+s3).encode(self.encoding)
  248. stream = StringIO.StringIO(s)
  249. reader = codecs.getreader(self.encoding)(stream)
  250. self.assertEqual(reader.readline(), s1)
  251. self.assertEqual(reader.readline(), s2)
  252. self.assertEqual(reader.readline(), s3)
  253. self.assertEqual(reader.readline(), u"")
  254. def test_bug1098990_b(self):
  255. s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
  256. s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
  257. s3 = u"stillokay:bbbbxx\r\n"
  258. s4 = u"broken!!!!badbad\r\n"
  259. s5 = u"againokay.\r\n"
  260. s = (s1+s2+s3+s4+s5).encode(self.encoding)
  261. stream = StringIO.StringIO(s)
  262. reader = codecs.getreader(self.encoding)(stream)
  263. self.assertEqual(reader.readline(), s1)
  264. self.assertEqual(reader.readline(), s2)
  265. self.assertEqual(reader.readline(), s3)
  266. self.assertEqual(reader.readline(), s4)
  267. self.assertEqual(reader.readline(), s5)
  268. self.assertEqual(reader.readline(), u"")
  269. class UTF32Test(ReadTest):
  270. encoding = "utf-32"
  271. spamle = ('\xff\xfe\x00\x00'
  272. 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
  273. 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
  274. spambe = ('\x00\x00\xfe\xff'
  275. '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
  276. '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
  277. def test_only_one_bom(self):
  278. _,_,reader,writer = codecs.lookup(self.encoding)
  279. # encode some stream
  280. s = StringIO.StringIO()
  281. f = writer(s)
  282. f.write(u"spam")
  283. f.write(u"spam")
  284. d = s.getvalue()
  285. # check whether there is exactly one BOM in it
  286. self.assertTrue(d == self.spamle or d == self.spambe)
  287. # try to read it back
  288. s = StringIO.StringIO(d)
  289. f = reader(s)
  290. self.assertEqual(f.read(), u"spamspam")
  291. def test_badbom(self):
  292. s = StringIO.StringIO(4*"\xff")
  293. f = codecs.getreader(self.encoding)(s)
  294. self.assertRaises(UnicodeError, f.read)
  295. s = StringIO.StringIO(8*"\xff")
  296. f = codecs.getreader(self.encoding)(s)
  297. self.assertRaises(UnicodeError, f.read)
  298. def test_partial(self):
  299. self.check_partial(
  300. u"\x00\xff\u0100\uffff\U00010000",
  301. [
  302. u"", # first byte of BOM read
  303. u"", # second byte of BOM read
  304. u"", # third byte of BOM read
  305. u"", # fourth byte of BOM read => byteorder known
  306. u"",
  307. u"",
  308. u"",
  309. u"\x00",
  310. u"\x00",
  311. u"\x00",
  312. u"\x00",
  313. u"\x00\xff",
  314. u"\x00\xff",
  315. u"\x00\xff",
  316. u"\x00\xff",
  317. u"\x00\xff\u0100",
  318. u"\x00\xff\u0100",
  319. u"\x00\xff\u0100",
  320. u"\x00\xff\u0100",
  321. u"\x00\xff\u0100\uffff",
  322. u"\x00\xff\u0100\uffff",
  323. u"\x00\xff\u0100\uffff",
  324. u"\x00\xff\u0100\uffff",
  325. u"\x00\xff\u0100\uffff\U00010000",
  326. ]
  327. )
  328. def test_handlers(self):
  329. self.assertEqual((u'\ufffd', 1),
  330. codecs.utf_32_decode('\x01', 'replace', True))
  331. self.assertEqual((u'', 1),
  332. codecs.utf_32_decode('\x01', 'ignore', True))
  333. def test_errors(self):
  334. self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
  335. "\xff", "strict", True)
  336. def test_issue8941(self):
  337. # Issue #8941: insufficient result allocation when decoding into
  338. # surrogate pairs on UCS-2 builds.
  339. encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
  340. self.assertEqual(u'\U00010000' * 1024,
  341. codecs.utf_32_decode(encoded_le)[0])
  342. encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
  343. self.assertEqual(u'\U00010000' * 1024,
  344. codecs.utf_32_decode(encoded_be)[0])
  345. class UTF32LETest(ReadTest):
  346. encoding = "utf-32-le"
  347. def test_partial(self):
  348. self.check_partial(
  349. u"\x00\xff\u0100\uffff\U00010000",
  350. [
  351. u"",
  352. u"",
  353. u"",
  354. u"\x00",
  355. u"\x00",
  356. u"\x00",
  357. u"\x00",
  358. u"\x00\xff",
  359. u"\x00\xff",
  360. u"\x00\xff",
  361. u"\x00\xff",
  362. u"\x00\xff\u0100",
  363. u"\x00\xff\u0100",
  364. u"\x00\xff\u0100",
  365. u"\x00\xff\u0100",
  366. u"\x00\xff\u0100\uffff",
  367. u"\x00\xff\u0100\uffff",
  368. u"\x00\xff\u0100\uffff",
  369. u"\x00\xff\u0100\uffff",
  370. u"\x00\xff\u0100\uffff\U00010000",
  371. ]
  372. )
  373. def test_simple(self):
  374. self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
  375. def test_errors(self):
  376. self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
  377. "\xff", "strict", True)
  378. def test_issue8941(self):
  379. # Issue #8941: insufficient result allocation when decoding into
  380. # surrogate pairs on UCS-2 builds.
  381. encoded = '\x00\x00\x01\x00' * 1024
  382. self.assertEqual(u'\U00010000' * 1024,
  383. codecs.utf_32_le_decode(encoded)[0])
  384. class UTF32BETest(ReadTest):
  385. encoding = "utf-32-be"
  386. def test_partial(self):
  387. self.check_partial(
  388. u"\x00\xff\u0100\uffff\U00010000",
  389. [
  390. u"",
  391. u"",
  392. u"",
  393. u"\x00",
  394. u"\x00",
  395. u"\x00",
  396. u"\x00",
  397. u"\x00\xff",
  398. u"\x00\xff",
  399. u"\x00\xff",
  400. u"\x00\xff",
  401. u"\x00\xff\u0100",
  402. u"\x00\xff\u0100",
  403. u"\x00\xff\u0100",
  404. u"\x00\xff\u0100",
  405. u"\x00\xff\u0100\uffff",
  406. u"\x00\xff\u0100\uffff",
  407. u"\x00\xff\u0100\uffff",
  408. u"\x00\xff\u0100\uffff",
  409. u"\x00\xff\u0100\uffff\U00010000",
  410. ]
  411. )
  412. def test_simple(self):
  413. self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
  414. def test_errors(self):
  415. self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
  416. "\xff", "strict", True)
  417. def test_issue8941(self):
  418. # Issue #8941: insufficient result allocation when decoding into
  419. # surrogate pairs on UCS-2 builds.
  420. encoded = '\x00\x01\x00\x00' * 1024
  421. self.assertEqual(u'\U00010000' * 1024,
  422. codecs.utf_32_be_decode(encoded)[0])
  423. class UTF16Test(ReadTest):
  424. encoding = "utf-16"
  425. spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
  426. spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
  427. def test_only_one_bom(self):
  428. _,_,reader,writer = codecs.lookup(self.encoding)
  429. # encode some stream
  430. s = StringIO.StringIO()
  431. f = writer(s)
  432. f.write(u"spam")
  433. f.write(u"spam")
  434. d = s.getvalue()
  435. # check whether there is exactly one BOM in it
  436. self.assertTrue(d == self.spamle or d == self.spambe)
  437. # try to read it back
  438. s = StringIO.StringIO(d)
  439. f = reader(s)
  440. self.assertEqual(f.read(), u"spamspam")
  441. def test_badbom(self):
  442. s = StringIO.StringIO("\xff\xff")
  443. f = codecs.getreader(self.encoding)(s)
  444. self.assertRaises(UnicodeError, f.read)
  445. s = StringIO.StringIO("\xff\xff\xff\xff")
  446. f = codecs.getreader(self.encoding)(s)
  447. self.assertRaises(UnicodeError, f.read)
  448. def test_partial(self):
  449. self.check_partial(
  450. u"\x00\xff\u0100\uffff\U00010000",
  451. [
  452. u"", # first byte of BOM read
  453. u"", # second byte of BOM read => byteorder known
  454. u"",
  455. u"\x00",
  456. u"\x00",
  457. u"\x00\xff",
  458. u"\x00\xff",
  459. u"\x00\xff\u0100",
  460. u"\x00\xff\u0100",
  461. u"\x00\xff\u0100\uffff",
  462. u"\x00\xff\u0100\uffff",
  463. u"\x00\xff\u0100\uffff",
  464. u"\x00\xff\u0100\uffff",
  465. u"\x00\xff\u0100\uffff\U00010000",
  466. ]
  467. )
  468. def test_handlers(self):
  469. self.assertEqual((u'\ufffd', 1),
  470. codecs.utf_16_decode('\x01', 'replace', True))
  471. self.assertEqual((u'', 1),
  472. codecs.utf_16_decode('\x01', 'ignore', True))
  473. def test_errors(self):
  474. self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
  475. def test_bug691291(self):
  476. # Files are always opened in binary mode, even if no binary mode was
  477. # specified. This means that no automatic conversion of '\n' is done
  478. # on reading and writing.
  479. s1 = u'Hello\r\nworld\r\n'
  480. s = s1.encode(self.encoding)
  481. self.addCleanup(test_support.unlink, test_support.TESTFN)
  482. with open(test_support.TESTFN, 'wb') as fp:
  483. fp.write(s)
  484. with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
  485. self.assertEqual(reader.read(), s1)
  486. class UTF16LETest(ReadTest):
  487. encoding = "utf-16-le"
  488. def test_partial(self):
  489. self.check_partial(
  490. u"\x00\xff\u0100\uffff\U00010000",
  491. [
  492. u"",
  493. u"\x00",
  494. u"\x00",
  495. u"\x00\xff",
  496. u"\x00\xff",
  497. u"\x00\xff\u0100",
  498. u"\x00\xff\u0100",
  499. u"\x00\xff\u0100\uffff",
  500. u"\x00\xff\u0100\uffff",
  501. u"\x00\xff\u0100\uffff",
  502. u"\x00\xff\u0100\uffff",
  503. u"\x00\xff\u0100\uffff\U00010000",
  504. ]
  505. )
  506. def test_errors(self):
  507. tests = [
  508. (b'\xff', u'\ufffd'),
  509. (b'A\x00Z', u'A\ufffd'),
  510. (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
  511. (b'\x00\xd8', u'\ufffd'),
  512. (b'\x00\xd8A', u'\ufffd'),
  513. (b'\x00\xd8A\x00', u'\ufffdA'),
  514. (b'\x00\xdcA\x00', u'\ufffdA'),
  515. ]
  516. for raw, expected in tests:
  517. try:
  518. with self.assertRaises(UnicodeDecodeError):
  519. codecs.utf_16_le_decode(raw, 'strict', True)
  520. self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
  521. except:
  522. print 'raw=%r' % raw
  523. raise
  524. class UTF16BETest(ReadTest):
  525. encoding = "utf-16-be"
  526. def test_partial(self):
  527. self.check_partial(
  528. u"\x00\xff\u0100\uffff\U00010000",
  529. [
  530. u"",
  531. u"\x00",
  532. u"\x00",
  533. u"\x00\xff",
  534. u"\x00\xff",
  535. u"\x00\xff\u0100",
  536. u"\x00\xff\u0100",
  537. u"\x00\xff\u0100\uffff",
  538. u"\x00\xff\u0100\uffff",
  539. u"\x00\xff\u0100\uffff",
  540. u"\x00\xff\u0100\uffff",
  541. u"\x00\xff\u0100\uffff\U00010000",
  542. ]
  543. )
  544. def test_errors(self):
  545. tests = [
  546. (b'\xff', u'\ufffd'),
  547. (b'\x00A\xff', u'A\ufffd'),
  548. (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
  549. (b'\xd8\x00', u'\ufffd'),
  550. (b'\xd8\x00\xdc', u'\ufffd'),
  551. (b'\xd8\x00\x00A', u'\ufffdA'),
  552. (b'\xdc\x00\x00A', u'\ufffdA'),
  553. ]
  554. for raw, expected in tests:
  555. try:
  556. with self.assertRaises(UnicodeDecodeError):
  557. codecs.utf_16_be_decode(raw, 'strict', True)
  558. self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
  559. except:
  560. print 'raw=%r' % raw
  561. raise
  562. class UTF8Test(ReadTest):
  563. encoding = "utf-8"
  564. def test_partial(self):
  565. self.check_partial(
  566. u"\x00\xff\u07ff\u0800\uffff\U00010000",
  567. [
  568. u"\x00",
  569. u"\x00",
  570. u"\x00\xff",
  571. u"\x00\xff",
  572. u"\x00\xff\u07ff",
  573. u"\x00\xff\u07ff",
  574. u"\x00\xff\u07ff",
  575. u"\x00\xff\u07ff\u0800",
  576. u"\x00\xff\u07ff\u0800",
  577. u"\x00\xff\u07ff\u0800",
  578. u"\x00\xff\u07ff\u0800\uffff",
  579. u"\x00\xff\u07ff\u0800\uffff",
  580. u"\x00\xff\u07ff\u0800\uffff",
  581. u"\x00\xff\u07ff\u0800\uffff",
  582. u"\x00\xff\u07ff\u0800\uffff\U00010000",
  583. ]
  584. )
  585. class UTF7Test(ReadTest):
  586. encoding = "utf-7"
  587. def test_ascii(self):
  588. # Set D (directly encoded characters)
  589. set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  590. 'abcdefghijklmnopqrstuvwxyz'
  591. '0123456789'
  592. '\'(),-./:?')
  593. self.assertEqual(set_d.encode(self.encoding), set_d)
  594. self.assertEqual(set_d.decode(self.encoding), set_d)
  595. # Set O (optional direct characters)
  596. set_o = ' !"#$%&*;<=>@[]^_`{|}'
  597. self.assertEqual(set_o.encode(self.encoding), set_o)
  598. self.assertEqual(set_o.decode(self.encoding), set_o)
  599. # +
  600. self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b')
  601. self.assertEqual('a+-b'.decode(self.encoding), u'a+b')
  602. # White spaces
  603. ws = ' \t\n\r'
  604. self.assertEqual(ws.encode(self.encoding), ws)
  605. self.assertEqual(ws.decode(self.encoding), ws)
  606. # Other ASCII characters
  607. other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) -
  608. set(set_d + set_o + '+' + ws)))
  609. self.assertEqual(other_ascii.encode(self.encoding),
  610. '+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
  611. 'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
  612. def test_partial(self):
  613. self.check_partial(
  614. u"a+-b",
  615. [
  616. u"a",
  617. u"a",
  618. u"a+",
  619. u"a+-",
  620. u"a+-b",
  621. ]
  622. )
  623. def test_errors(self):
  624. tests = [
  625. ('\xe1b', u'\ufffdb'),
  626. ('a\xe1b', u'a\ufffdb'),
  627. ('a\xe1\xe1b', u'a\ufffd\ufffdb'),
  628. ('a+IK', u'a\ufffd'),
  629. ('a+IK-b', u'a\ufffdb'),
  630. ('a+IK,b', u'a\ufffdb'),
  631. ('a+IKx', u'a\u20ac\ufffd'),
  632. ('a+IKx-b', u'a\u20ac\ufffdb'),
  633. ('a+IKwgr', u'a\u20ac\ufffd'),
  634. ('a+IKwgr-b', u'a\u20ac\ufffdb'),
  635. ('a+IKwgr,', u'a\u20ac\ufffd'),
  636. ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
  637. ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
  638. ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
  639. ('a+/,+IKw-b', u'a\ufffd\u20acb'),
  640. ('a+//,+IKw-b', u'a\ufffd\u20acb'),
  641. ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
  642. ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
  643. ('a+IKw-b\xe1', u'a\u20acb\ufffd'),
  644. ('a+IKw\xe1b', u'a\u20ac\ufffdb'),
  645. ]
  646. for raw, expected in tests:
  647. try:
  648. with self.assertRaises(UnicodeDecodeError):
  649. codecs.utf_7_decode(raw, 'strict', True)
  650. self.assertEqual(raw.decode('utf-7', 'replace'), expected)
  651. except:
  652. print 'raw=%r' % raw
  653. raise
  654. def test_nonbmp(self):
  655. self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
  656. self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
  657. self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
  658. self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0')
  659. self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-')
  660. self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0')
  661. self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0')
  662. self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding),
  663. '+IKwgrNgB3KA-')
  664. self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding),
  665. u'\u20ac\u20ac\U000104A0')
  666. self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding),
  667. u'\u20ac\u20ac\U000104A0')
def test_lone_surrogates(self):
    # Unpaired UTF-16 surrogates inside the base-64 section.  A lone
    # high surrogate in a cleanly terminated section is preserved;
    # truncated or otherwise ill-formed sequences become U+FFFD under
    # errors="replace".
    tests = [
        ('a+2AE-b', u'a\ud801b'),
        ('a+2AE\xe1b', u'a\ufffdb'),
        ('a+2AE', u'a\ufffd'),
        ('a+2AEA-b', u'a\ufffdb'),
        ('a+2AH-b', u'a\ufffdb'),
        ('a+IKzYAQ-b', u'a\u20ac\ud801b'),
        ('a+IKzYAQ\xe1b', u'a\u20ac\ufffdb'),
        ('a+IKzYAQA-b', u'a\u20ac\ufffdb'),
        ('a+IKzYAd-b', u'a\u20ac\ufffdb'),
        ('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'),
        ('a+IKwgrNgB\xe1b', u'a\u20ac\u20ac\ufffdb'),
        ('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'),
        ('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'),
    ]
    for raw, expected in tests:
        try:
            self.assertEqual(raw.decode('utf-7', 'replace'), expected)
        except:
            # Identify the offending test case before re-raising.
            print 'raw=%r' % raw
            raise
  690. class UTF16ExTest(unittest.TestCase):
  691. def test_errors(self):
  692. self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
  693. def test_bad_args(self):
  694. self.assertRaises(TypeError, codecs.utf_16_ex_decode)
  695. class ReadBufferTest(unittest.TestCase):
  696. def test_array(self):
  697. import array
  698. self.assertEqual(
  699. codecs.readbuffer_encode(array.array("c", "spam")),
  700. ("spam", 4)
  701. )
  702. def test_empty(self):
  703. self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
  704. def test_bad_args(self):
  705. self.assertRaises(TypeError, codecs.readbuffer_encode)
  706. self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
  707. class CharBufferTest(unittest.TestCase):
  708. def test_string(self):
  709. self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
  710. def test_empty(self):
  711. self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
  712. def test_bad_args(self):
  713. self.assertRaises(TypeError, codecs.charbuffer_encode)
  714. self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
  715. class UTF8SigTest(ReadTest):
  716. encoding = "utf-8-sig"
  717. def test_partial(self):
  718. self.check_partial(
  719. u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
  720. [
  721. u"",
  722. u"",
  723. u"", # First BOM has been read and skipped
  724. u"",
  725. u"",
  726. u"\ufeff", # Second BOM has been read and emitted
  727. u"\ufeff\x00", # "\x00" read and emitted
  728. u"\ufeff\x00", # First byte of encoded u"\xff" read
  729. u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
  730. u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
  731. u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
  732. u"\ufeff\x00\xff\u07ff",
  733. u"\ufeff\x00\xff\u07ff",
  734. u"\ufeff\x00\xff\u07ff\u0800",
  735. u"\ufeff\x00\xff\u07ff\u0800",
  736. u"\ufeff\x00\xff\u07ff\u0800",
  737. u"\ufeff\x00\xff\u07ff\u0800\uffff",
  738. u"\ufeff\x00\xff\u07ff\u0800\uffff",
  739. u"\ufeff\x00\xff\u07ff\u0800\uffff",
  740. u"\ufeff\x00\xff\u07ff\u0800\uffff",
  741. u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
  742. ]
  743. )
  744. def test_bug1601501(self):
  745. # SF bug #1601501: check that the codec works with a buffer
  746. unicode("\xef\xbb\xbf", "utf-8-sig")
  747. def test_bom(self):
  748. d = codecs.getincrementaldecoder("utf-8-sig")()
  749. s = u"spam"
  750. self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
  751. def test_stream_bom(self):
  752. unistring = u"ABC\u00A1\u2200XYZ"
  753. bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
  754. reader = codecs.getreader("utf-8-sig")
  755. for sizehint in [None] + range(1, 11) + \
  756. [64, 128, 256, 512, 1024]:
  757. istream = reader(StringIO.StringIO(bytestring))
  758. ostream = StringIO.StringIO()
  759. while 1:
  760. if sizehint is not None:
  761. data = istream.read(sizehint)
  762. else:
  763. data = istream.read()
  764. if not data:
  765. break
  766. ostream.write(data)
  767. got = ostream.getvalue()
  768. self.assertEqual(got, unistring)
  769. def test_stream_bare(self):
  770. unistring = u"ABC\u00A1\u2200XYZ"
  771. bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
  772. reader = codecs.getreader("utf-8-sig")
  773. for sizehint in [None] + range(1, 11) + \
  774. [64, 128, 256, 512, 1024]:
  775. istream = reader(StringIO.StringIO(bytestring))
  776. ostream = StringIO.StringIO()
  777. while 1:
  778. if sizehint is not None:
  779. data = istream.read(sizehint)
  780. else:
  781. data = istream.read()
  782. if not data:
  783. break
  784. ostream.write(data)
  785. got = ostream.getvalue()
  786. self.assertEqual(got, unistring)
class EscapeDecodeTest(unittest.TestCase):
    # Tests for codecs.escape_decode(), which interprets Python
    # string-escape sequences in a byte string.

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        # Every byte other than the backslash passes through untouched.
        decode = codecs.escape_decode
        for b in range(256):
            b = chr(b)
            if b != '\\':
                self.assertEqual(decode(b + '0'), (b + '0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        # coding_checker is a helper defined earlier in this file.
        check = coding_checker(self, decode)
        # Line continuation and the simple one-character escapes.
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes take up to three digits; "\8" is not an escape.
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        # \501 (octal 321) wraps to 0x41 — value is taken modulo 256.
        check(br"[\501]", b"[A]")
        # Hex escapes take exactly two digits; "\X" is not an escape.
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # Any unrecognized escape keeps the backslash and the byte.
        for b in range(256):
            b = chr(b)
            if b not in '\n"\'\\abtnvfr01234567x':
                check('\\' + b, '\\' + b)

    def test_errors(self):
        decode = codecs.escape_decode
        # "\x" with fewer than two hex digits: error in strict mode,
        # dropped with "ignore", replaced with "?" under "replace".
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
  835. class RecodingTest(unittest.TestCase):
  836. def test_recoding(self):
  837. f = StringIO.StringIO()
  838. f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
  839. f2.write(u"a")
  840. f2.close()
  841. # Python used to crash on this at exit because of a refcount
  842. # bug in _codecsmodule.c
# From RFC 3492
# Sample strings A-S from the RFC: each entry pairs a unicode string
# with its expected Punycode encoding (as a byte string).
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),
    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),
    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),
    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),
    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),
    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),
    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),
    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),
    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check: flag any entry that is not a two-element pair (a
# missing comma would silently merge adjacent string literals).
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
  936. class PunycodeTest(unittest.TestCase):
  937. def test_encode(self):
  938. for uni, puny in punycode_testcases:
  939. # Need to convert both strings to lower case, since
  940. # some of the extended encodings use upper case, but our
  941. # code produces only lower case. Converting just puny to
  942. # lower is also insufficient, since some of the input characters
  943. # are upper case.
  944. self.assertEqual(uni.encode("punycode").lower(), puny.lower())
  945. def test_decode(self):
  946. for uni, puny in punycode_testcases:
  947. self.assertEqual(uni, puny.decode("punycode"))
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the unicode_internal codec.  Most cases are only
    # meaningful on UCS-4 builds, hence the sys.maxunicode guards.

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                # The byte strings above are big-endian; mirror them on
                # little-endian machines to match the internal layout.
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        # The UnicodeDecodeError raised for a bad trailing unit must
        # carry the correct encoding, object and start/end offsets.
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        # A registered "ignore" handler must skip the invalid unit and
        # still report the full number of bytes consumed (12).
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739: the reported length is in characters, not bytes.
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 byte strings for the
# nameprep profile.  expected=None means the input must be rejected;
# an entry of (None, None) marks a vector skipped by this test suite.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
class NameprepTest(unittest.TestCase):
    # Run the libidn nameprep test vectors (nameprep_tests above)
    # against encodings.idna.nameprep().

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception,e:
                    # Re-raise with the vector number (3.N) for context.
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
class IDNACodecTest(unittest.TestCase):
    # Tests for the idna codec: whole-string, stream and incremental
    # encoding/decoding of internationalized domain names.

    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # After the stream is exhausted, further reads return u"".
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # Labels are only emitted once their terminating dot (or the
        # final flush) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        # Same buffering behaviour for the incremental encoder.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
class CodecsModuleTest(unittest.TestCase):
    # Sanity checks for the top-level codecs module API.

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # With no encoding argument the default codec is used.
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        # NOTE(review): u'\xffff' parses as u'\xff' followed by "ff";
        # it still triggers UnicodeEncodeError under ASCII, but
        # u'\uffff' may have been intended — confirm.
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        oldlocale = locale.getlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        # codecs.__all__ must match this explicit list exactly, and
        # every listed name must actually exist on the module.
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertEqual(sorted(api), sorted(codecs.__all__))
        for api in codecs.__all__:
            getattr(codecs, api)
  1304. class StreamReaderTest(unittest.TestCase):
  1305. def setUp(self):
  1306. self.reader = codecs.getreader('utf-8')
  1307. self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
  1308. def test_readlines(self):
  1309. f = self.reader(self.stream)
  1310. self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
  1311. class EncodedFileTest(unittest.TestCase):
  1312. def test_basic(self):
  1313. f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
  1314. ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
  1315. self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
  1316. f = StringIO.StringIO()
  1317. ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
  1318. ef.write('\xc3\xbc')
  1319. self.assertEqual(f.getvalue(), '\xfc')
  1320. class Str2StrTest(unittest.TestCase):
  1321. def test_read(self):
  1322. sin = codecs.encode("\x80", "base64_codec")
  1323. reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
  1324. sout = reader.read()
  1325. self.assertEqual(sout, "\x80")
  1326. self.assertIsInstance(sout, str)
  1327. def test_readline(self):
  1328. sin = codecs.encode("\x80", "base64_codec")
  1329. reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
  1330. sout = reader.readline()
  1331. self.assertEqual(sout, "\x80")
  1332. self.assertIsInstance(sout, str)
# Codecs that accept unicode input; BasicUnicodeTest below round-trips
# a sample string through every one of them.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# mbcs only exists on Windows builds.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams[:]
# Under the -3 (py3k warning) flag rot_13 streams are also treated as
# broken.
if sys.flags.py3k_warning:
    broken_unicode_with_streams.append("rot_13")

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# The bz2/zlib codecs are exercised only when the underlying modules
# are available in this build.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
class BasicUnicodeTest(unittest.TestCase):
    """Round-trip every registered unicode codec through its stateless,
    stream, incremental and (CPython-only) C-API interfaces."""

    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            # lookup() normalizes codec names; undo the normalizations so
            # the dash-insensitive equality check below can succeed.
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless encoder/decoder round trip; size is the number of
            # input characters consumed.
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    # Feed one character at a time to exercise stateful
                    # encoding across write() calls.
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes any pending state.
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(
                        codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c)
                                                 for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @test_support.cpython_only
    def test_basics_capi(self):
        # Same incremental round trip as above, but through the C-level
        # PyCodec_IncrementalEncoder/Decoder entry points.
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c)
                                                 for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        # Calling a decoder with no argument, or with a non-string, must
        # raise TypeError rather than crash.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Calling an encoder with no argument must raise TypeError.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
  1613. class BasicStrTest(unittest.TestCase):
  1614. def test_basics(self):
  1615. s = "abc123"
  1616. for encoding in all_string_encodings:
  1617. (bytes, size) = codecs.getencoder(encoding)(s)
  1618. self.assertEqual(size, len(s))
  1619. (chars, size) = codecs.getdecoder(encoding)(bytes)
  1620. self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
class CharmapTest(unittest.TestCase):
    """Exercise codecs.charmap_decode() with the three supported mapping
    types: a unicode string map, an int-to-unicode dict and an
    int-to-int dict, under "strict", "replace" and "ignore" errors."""

    def test_decode_with_string_map(self):
        # Each input byte indexes into the mapping string.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )
        # A byte past the end of the map is unmapped -> strict raises.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
        )
        # U+FFFE in the map explicitly marks a position as unmapped.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )
        # With an empty map nothing is mapped, but "ignore" still consumes
        # every input byte.
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u'c'}),
            (u"abc", 3)
        )
        # A single byte may map to a multi-character replacement string.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
            (u"AaBbCc", 3)
        )
        # Non-BMP replacement characters are allowed.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
            (u"\U0010FFFFbc", 3)
        )
        # An empty replacement string silently drops the byte.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u''}),
            (u"ab", 3)
        )
        # Missing key and None both mean "unmapped" -> strict raises.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b'}
        )
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b', 2: None}
        )
        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b', 2: u'\ufffe'}
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b'}),
            (u"ab\ufffd", 3)
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab\ufffd", 3)
        )
        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab\ufffd", 3)
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b'}),
            (u"ab", 3)
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab", 3)
        )
        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab", 3)
        )
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            (u"", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            (u"abc", 3)
        )
        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            (u"\U0010FFFFbc", 3)
        )
        # Mapping above the Unicode range is a TypeError, not a decode error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: 0x110000, 1: b, 2: c}
        )
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: a, 1: b},
        )
        # 0xFFFE as a target marks the position unmapped, as for str maps.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: a, 1: b, 2: 0xFFFE},
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            (u"ab\ufffd", 3)
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab\ufffd", 3)
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            (u"ab", 3)
        )
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab", 3)
        )
  1772. class WithStmtTest(unittest.TestCase):
  1773. def test_encodedfile(self):
  1774. f = StringIO.StringIO("\xc3\xbc")
  1775. with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
  1776. self.assertEqual(ef.read(), "\xfc")
  1777. def test_streamreaderwriter(self):
  1778. f = StringIO.StringIO("\xc3\xbc")
  1779. info = codecs.lookup("utf-8")
  1780. with codecs.StreamReaderWriter(f, info.streamreader,
  1781. info.streamwriter, 'strict') as srw:
  1782. self.assertEqual(srw.read(), u"\xfc")
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the unicode-escape codec (encode and decode)."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        # Printable ASCII (except the backslash) passes through unescaped.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != ord('\\'):
                self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to itself.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != ord('\\'):
                self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check(u'\t', r'\t')
        check(u'\n', r'\n')
        check(u'\r', r'\r')
        check(u'\\', r'\\')
        # Other control chars and all non-ASCII code points < 256 use \xNN.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(unichr(b), '\\x%02x' % b)
        for b in range(127, 256):
            check(unichr(b), '\\x%02x' % b)
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        # Backslash-newline is a line continuation and disappears.
        check("[\\\n]", u"[]")
        check(r'[\"]', u'["]')
        check(r"[\']", u"[']")
        check(r"[\\]", ur"[\]")
        check(r"[\a]", u"[\x07]")
        check(r"[\b]", u"[\x08]")
        check(r"[\t]", u"[\x09]")
        check(r"[\n]", u"[\x0a]")
        check(r"[\v]", u"[\x0b]")
        check(r"[\f]", u"[\x0c]")
        check(r"[\r]", u"[\x0d]")
        # Octal escapes take at most three digits; \8 is not an escape.
        check(r"[\7]", u"[\x07]")
        check(r"[\8]", ur"[\8]")
        check(r"[\78]", u"[\x078]")
        check(r"[\41]", u"[!]")
        check(r"[\418]", u"[!8]")
        check(r"[\101]", u"[A]")
        check(r"[\1010]", u"[A0]")
        check(r"[\x41]", u"[A]")
        check(r"[\x410]", u"[A0]")
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")
        # Unrecognized escapes keep the backslash verbatim.
        for b in range(256):
            if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
                check('\\' + chr(b), u'\\' + unichr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Truncated \x, \u and \U escapes raise in strict mode and are
        # handled by the "ignore" and "replace" error callbacks.
        for c, d in ('x', 2), ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # \U escapes above the Unicode range (U+10FFFF) are errors too.
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for the raw-unicode-escape codec, which only processes
    \\uNNNN and \\UNNNNNNNN escapes and leaves everything else alone."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        # Every code point < 256 passes through unescaped.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        # Every byte decodes to itself.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash before anything but u/U stays a literal backslash.
        for b in range(256):
            if chr(b) not in 'uU':
                check(u'\\' + unichr(b), '\\' + chr(b))
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # Only \u and \U introduce escapes on decode as well.
        for b in range(256):
            if chr(b) not in 'uU':
                check('\\' + chr(b), u'\\' + unichr(b))
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Truncated \u and \U escapes raise in strict mode and are handled
        # by the "ignore" and "replace" error callbacks.
        for c, d in ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # \U escapes above the Unicode range (U+10FFFF) are errors too.
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
class BomTest(unittest.TestCase):
    def test_seek0(self):
        """The BOM must be written exactly once, and only when writing at
        position 0 of the stream, regardless of intervening seeks — both
        through the file wrapper and the underlying StreamWriter."""
        data = u"1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
  1948. class TransformCodecTest(unittest.TestCase):
  1949. def test_quopri_stateless(self):
  1950. # Should encode with quotetabs=True
  1951. encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
  1952. self.assertEqual(encoded, b"space=20tab=09eol=20\n")
  1953. # But should still support unescaped tabs and spaces
  1954. unescaped = b"space tab eol\n"
  1955. self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
  1956. def test_uu_invalid(self):
  1957. # Missing "begin" line
  1958. self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
def test_main():
    """Run every codec test case defined in this module."""
    test_support.run_unittest(
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
        TransformCodecTest,
    )


if __name__ == "__main__":
    test_main()