gb18030_encoding.phpt 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. --TEST--
  2. Exhaustive test of verification and conversion of GB18030 text
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  7. if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
  8. ?>
  9. --FILE--
  10. <?php
  11. include('encoding_tests.inc');
  12. srand(1111); // Make results consistent
  13. mb_substitute_character(0x25); // '%'
  14. readConversionTable(__DIR__ . '/data/GB18030-2byte.txt', $toUnicode, $fromUnicode);
  15. /* GB18030 represents all Unicode codepoints in the BMP which are _not_ covered by any
  16. * 2-byte GB18030 codepoint as a 4-byte code, with each of the 4 bytes in the following ranges:
  17. *
  18. * - 1st byte: 0x81-0x84
  19. * - 2nd byte: 0x30-0x39
  20. * - 3rd byte: 0x81-0xFE
  21. * - 4th byte: 0x30-0x39
  22. *
  23. * These start from 0x81308130 and count upwards one by one, with all the Unicode codepoints
  24. * which need to be represented as a 4-byte code appearing in sequence.
  25. *
  26. * Each subarray here is: [starting GB18030 codepoint bytes (4 of them), Unicode codepoint which it
  27. * converts to, number of sequential Unicode codepoints represented by sequential GB18030 codepoints] */
  28. $gb18030_BMP_Mappings = [
  29. [0x81, 0x30, 0x81, 0x30, 0x80, 36],
  30. [0x81, 0x30, 0x84, 0x36, 0xa5, 2],
  31. [0x81, 0x30, 0x84, 0x38, 0xa9, 7],
  32. [0x81, 0x30, 0x85, 0x35, 0xb2, 5],
  33. [0x81, 0x30, 0x86, 0x30, 0xb8, 31],
  34. [0x81, 0x30, 0x89, 0x31, 0xd8, 8],
  35. [0x81, 0x30, 0x89, 0x39, 0xe2, 6],
  36. [0x81, 0x30, 0x8a, 0x35, 0xeb, 1],
  37. [0x81, 0x30, 0x8a, 0x36, 0xee, 4],
  38. [0x81, 0x30, 0x8b, 0x30, 0xf4, 3],
  39. [0x81, 0x30, 0x8b, 0x33, 0xf8, 1],
  40. [0x81, 0x30, 0x8b, 0x34, 0xfb, 1],
  41. [0x81, 0x30, 0x8b, 0x35, 0xfd, 4],
  42. [0x81, 0x30, 0x8b, 0x39, 0x102, 17],
  43. [0x81, 0x30, 0x8d, 0x36, 0x114, 7],
  44. [0x81, 0x30, 0x8e, 0x33, 0x11c, 15],
  45. [0x81, 0x30, 0x8f, 0x38, 0x12c, 24],
  46. [0x81, 0x30, 0x92, 0x32, 0x145, 3],
  47. [0x81, 0x30, 0x92, 0x35, 0x149, 4],
  48. [0x81, 0x30, 0x92, 0x39, 0x14e, 29],
  49. [0x81, 0x30, 0x95, 0x38, 0x16c, 98],
  50. [0x81, 0x30, 0x9f, 0x36, 0x1cf, 1],
  51. [0x81, 0x30, 0x9f, 0x37, 0x1d1, 1],
  52. [0x81, 0x30, 0x9f, 0x38, 0x1d3, 1],
  53. [0x81, 0x30, 0x9f, 0x39, 0x1d5, 1],
  54. [0x81, 0x30, 0xa0, 0x30, 0x1d7, 1],
  55. [0x81, 0x30, 0xa0, 0x31, 0x1d9, 1],
  56. [0x81, 0x30, 0xa0, 0x32, 0x1db, 1],
  57. [0x81, 0x30, 0xa0, 0x33, 0x1dd, 28],
  58. [0x81, 0x30, 0xa3, 0x31, 0x1fa, 87],
  59. [0x81, 0x30, 0xab, 0x38, 0x252, 15],
  60. [0x81, 0x30, 0xad, 0x33, 0x262, 101],
  61. [0x81, 0x30, 0xb7, 0x34, 0x2c8, 1],
  62. [0x81, 0x30, 0xb7, 0x35, 0x2cc, 13],
  63. [0x81, 0x30, 0xb8, 0x38, 0x2da, 183],
  64. [0x81, 0x30, 0xcb, 0x31, 0x3a2, 1],
  65. [0x81, 0x30, 0xcb, 0x32, 0x3aa, 7],
  66. [0x81, 0x30, 0xcb, 0x39, 0x3c2, 1],
  67. [0x81, 0x30, 0xcc, 0x30, 0x3ca, 55],
  68. [0x81, 0x30, 0xd1, 0x35, 0x402, 14],
  69. [0x81, 0x30, 0xd2, 0x39, 0x450, 1],
  70. [0x81, 0x30, 0xd3, 0x30, 0x452, 7102],
  71. [0x81, 0x36, 0xa5, 0x32, 0x2011, 2],
  72. [0x81, 0x36, 0xa5, 0x34, 0x2017, 1],
  73. [0x81, 0x36, 0xa5, 0x35, 0x201a, 2],
  74. [0x81, 0x36, 0xa5, 0x37, 0x201e, 7],
  75. [0x81, 0x36, 0xa6, 0x34, 0x2027, 9],
  76. [0x81, 0x36, 0xa7, 0x33, 0x2031, 1],
  77. [0x81, 0x36, 0xa7, 0x34, 0x2034, 1],
  78. [0x81, 0x36, 0xa7, 0x35, 0x2036, 5],
  79. [0x81, 0x36, 0xa8, 0x30, 0x203c, 112],
  80. [0x81, 0x36, 0xb3, 0x32, 0x20ad, 86],
  81. [0x81, 0x36, 0xbb, 0x38, 0x2104, 1],
  82. [0x81, 0x36, 0xbb, 0x39, 0x2106, 3],
  83. [0x81, 0x36, 0xbc, 0x32, 0x210a, 12],
  84. [0x81, 0x36, 0xbd, 0x34, 0x2117, 10],
  85. [0x81, 0x36, 0xbe, 0x34, 0x2122, 62],
  86. [0x81, 0x36, 0xc4, 0x36, 0x216c, 4],
  87. [0x81, 0x36, 0xc5, 0x30, 0x217a, 22],
  88. [0x81, 0x36, 0xc7, 0x32, 0x2194, 2],
  89. [0x81, 0x36, 0xc7, 0x34, 0x219a, 110],
  90. [0x81, 0x36, 0xd2, 0x34, 0x2209, 6],
  91. [0x81, 0x36, 0xd3, 0x30, 0x2210, 1],
  92. [0x81, 0x36, 0xd3, 0x31, 0x2212, 3],
  93. [0x81, 0x36, 0xd3, 0x34, 0x2216, 4],
  94. [0x81, 0x36, 0xd3, 0x38, 0x221b, 2],
  95. [0x81, 0x36, 0xd4, 0x30, 0x2221, 2],
  96. [0x81, 0x36, 0xd4, 0x32, 0x2224, 1],
  97. [0x81, 0x36, 0xd4, 0x33, 0x2226, 1],
  98. [0x81, 0x36, 0xd4, 0x34, 0x222c, 2],
  99. [0x81, 0x36, 0xd4, 0x36, 0x222f, 5],
  100. [0x81, 0x36, 0xd5, 0x31, 0x2238, 5],
  101. [0x81, 0x36, 0xd5, 0x36, 0x223e, 10],
  102. [0x81, 0x36, 0xd6, 0x36, 0x2249, 3],
  103. [0x81, 0x36, 0xd6, 0x39, 0x224d, 5],
  104. [0x81, 0x36, 0xd7, 0x34, 0x2253, 13],
  105. [0x81, 0x36, 0xd8, 0x37, 0x2262, 2],
  106. [0x81, 0x36, 0xd8, 0x39, 0x2268, 6],
  107. [0x81, 0x36, 0xd9, 0x35, 0x2270, 37],
  108. [0x81, 0x36, 0xdd, 0x32, 0x2296, 3],
  109. [0x81, 0x36, 0xdd, 0x35, 0x229a, 11],
  110. [0x81, 0x36, 0xde, 0x36, 0x22a6, 25],
  111. [0x81, 0x36, 0xe1, 0x31, 0x22c0, 82],
  112. [0x81, 0x36, 0xe9, 0x33, 0x2313, 333],
  113. [0x81, 0x37, 0x8c, 0x36, 0x246a, 10],
  114. [0x81, 0x37, 0x8d, 0x36, 0x249c, 100],
  115. [0x81, 0x37, 0x97, 0x36, 0x254c, 4],
  116. [0x81, 0x37, 0x98, 0x30, 0x2574, 13],
  117. [0x81, 0x37, 0x99, 0x33, 0x2590, 3],
  118. [0x81, 0x37, 0x99, 0x36, 0x2596, 10],
  119. [0x81, 0x37, 0x9a, 0x36, 0x25a2, 16],
  120. [0x81, 0x37, 0x9c, 0x32, 0x25b4, 8],
  121. [0x81, 0x37, 0x9d, 0x30, 0x25be, 8],
  122. [0x81, 0x37, 0x9d, 0x38, 0x25c8, 3],
  123. [0x81, 0x37, 0x9e, 0x31, 0x25cc, 2],
  124. [0x81, 0x37, 0x9e, 0x33, 0x25d0, 18],
  125. [0x81, 0x37, 0xa0, 0x31, 0x25e6, 31],
  126. [0x81, 0x37, 0xa3, 0x32, 0x2607, 2],
  127. [0x81, 0x37, 0xa3, 0x34, 0x260a, 54],
  128. [0x81, 0x37, 0xa8, 0x38, 0x2641, 1],
  129. [0x81, 0x37, 0xa8, 0x39, 0x2643, 2110],
  130. [0x81, 0x38, 0xfd, 0x39, 0x2e82, 2],
  131. [0x81, 0x38, 0xfe, 0x31, 0x2e85, 3],
  132. [0x81, 0x38, 0xfe, 0x34, 0x2e89, 2],
  133. [0x81, 0x38, 0xfe, 0x36, 0x2e8d, 10],
  134. [0x81, 0x39, 0x81, 0x36, 0x2e98, 15],
  135. [0x81, 0x39, 0x83, 0x31, 0x2ea8, 2],
  136. [0x81, 0x39, 0x83, 0x33, 0x2eab, 3],
  137. [0x81, 0x39, 0x83, 0x36, 0x2eaf, 4],
  138. [0x81, 0x39, 0x84, 0x30, 0x2eb4, 2],
  139. [0x81, 0x39, 0x84, 0x32, 0x2eb8, 3],
  140. [0x81, 0x39, 0x84, 0x35, 0x2ebc, 14],
  141. [0x81, 0x39, 0x85, 0x39, 0x2ecb, 293],
  142. [0x81, 0x39, 0xa3, 0x32, 0x2ffc, 4],
  143. [0x81, 0x39, 0xa3, 0x36, 0x3004, 1],
  144. [0x81, 0x39, 0xa3, 0x37, 0x3018, 5],
  145. [0x81, 0x39, 0xa4, 0x32, 0x301f, 2],
  146. [0x81, 0x39, 0xa4, 0x34, 0x302a, 20],
  147. [0x81, 0x39, 0xa6, 0x34, 0x303f, 2],
  148. [0x81, 0x39, 0xa6, 0x36, 0x3094, 7],
  149. [0x81, 0x39, 0xa7, 0x33, 0x309f, 2],
  150. [0x81, 0x39, 0xa7, 0x35, 0x30f7, 5],
  151. [0x81, 0x39, 0xa8, 0x30, 0x30ff, 6],
  152. [0x81, 0x39, 0xa8, 0x36, 0x312a, 246],
  153. [0x81, 0x39, 0xc1, 0x32, 0x322a, 7],
  154. [0x81, 0x39, 0xc1, 0x39, 0x3232, 113],
  155. [0x81, 0x39, 0xcd, 0x32, 0x32a4, 234],
  156. [0x81, 0x39, 0xe4, 0x36, 0x3390, 12],
  157. [0x81, 0x39, 0xe5, 0x38, 0x339f, 2],
  158. [0x81, 0x39, 0xe6, 0x30, 0x33a2, 34],
  159. [0x81, 0x39, 0xe9, 0x34, 0x33c5, 9],
  160. [0x81, 0x39, 0xea, 0x33, 0x33cf, 2],
  161. [0x81, 0x39, 0xea, 0x35, 0x33d3, 2],
  162. [0x81, 0x39, 0xea, 0x37, 0x33d6, 113],
  163. [0x81, 0x39, 0xf6, 0x30, 0x3448, 43],
  164. [0x81, 0x39, 0xfa, 0x33, 0x3474, 298],
  165. [0x82, 0x30, 0x9a, 0x31, 0x359f, 111],
  166. [0x82, 0x30, 0xa5, 0x32, 0x360f, 11],
  167. [0x82, 0x30, 0xa6, 0x33, 0x361b, 765],
  168. [0x82, 0x30, 0xf2, 0x38, 0x3919, 85],
  169. [0x82, 0x30, 0xfb, 0x33, 0x396f, 96],
  170. [0x82, 0x31, 0x86, 0x39, 0x39d1, 14],
  171. [0x82, 0x31, 0x88, 0x33, 0x39e0, 147],
  172. [0x82, 0x31, 0x97, 0x30, 0x3a74, 218],
  173. [0x82, 0x31, 0xac, 0x38, 0x3b4f, 287],
  174. [0x82, 0x31, 0xc9, 0x35, 0x3c6f, 113],
  175. [0x82, 0x31, 0xd4, 0x38, 0x3ce1, 885],
  176. [0x82, 0x32, 0xaf, 0x33, 0x4057, 264],
  177. [0x82, 0x32, 0xc9, 0x37, 0x4160, 471],
  178. [0x82, 0x32, 0xf8, 0x38, 0x4338, 116],
  179. [0x82, 0x33, 0x86, 0x34, 0x43ad, 4],
  180. [0x82, 0x33, 0x86, 0x38, 0x43b2, 43],
  181. [0x82, 0x33, 0x8b, 0x31, 0x43de, 248],
  182. [0x82, 0x33, 0xa3, 0x39, 0x44d7, 373],
  183. [0x82, 0x33, 0xc9, 0x32, 0x464d, 20],
  184. [0x82, 0x33, 0xcb, 0x32, 0x4662, 193],
  185. [0x82, 0x33, 0xde, 0x35, 0x4724, 5],
  186. [0x82, 0x33, 0xdf, 0x30, 0x472a, 82],
  187. [0x82, 0x33, 0xe7, 0x32, 0x477d, 16],
  188. [0x82, 0x33, 0xe8, 0x38, 0x478e, 441],
  189. [0x82, 0x34, 0x96, 0x39, 0x4948, 50],
  190. [0x82, 0x34, 0x9b, 0x39, 0x497b, 2],
  191. [0x82, 0x34, 0x9c, 0x31, 0x497e, 4],
  192. [0x82, 0x34, 0x9c, 0x35, 0x4984, 1],
  193. [0x82, 0x34, 0x9c, 0x36, 0x4987, 20],
  194. [0x82, 0x34, 0x9e, 0x36, 0x499c, 3],
  195. [0x82, 0x34, 0x9e, 0x39, 0x49a0, 22],
  196. [0x82, 0x34, 0xa1, 0x31, 0x49b8, 703],
  197. [0x82, 0x34, 0xe7, 0x34, 0x4c78, 39],
  198. [0x82, 0x34, 0xeb, 0x33, 0x4ca4, 111],
  199. [0x82, 0x34, 0xf6, 0x34, 0x4d1a, 148],
  200. [0x82, 0x35, 0x87, 0x32, 0x4daf, 81],
  201. [0x82, 0x35, 0x8f, 0x33, 0x9fa6, 14426],
  202. [0x83, 0x36, 0xc7, 0x39, 0xe76c, 1],
  203. [0x83, 0x36, 0xc8, 0x30, 0xe7c8, 1],
  204. [0x83, 0x36, 0xc8, 0x31, 0xe7e7, 13],
  205. [0x83, 0x36, 0xc9, 0x34, 0xe815, 1],
  206. [0x83, 0x36, 0xc9, 0x35, 0xe819, 5],
  207. [0x83, 0x36, 0xca, 0x30, 0xe81f, 7],
  208. [0x83, 0x36, 0xca, 0x37, 0xe827, 4],
  209. [0x83, 0x36, 0xcb, 0x31, 0xe82d, 4],
  210. [0x83, 0x36, 0xcb, 0x35, 0xe833, 8],
  211. [0x83, 0x36, 0xcc, 0x33, 0xe83c, 7],
  212. [0x83, 0x36, 0xcd, 0x30, 0xe844, 16],
  213. [0x83, 0x36, 0xce, 0x36, 0xe856, 14],
  214. [0x83, 0x36, 0xd0, 0x30, 0xe865, 4295],
  215. [0x84, 0x30, 0x85, 0x35, 0xf92d, 76],
  216. [0x84, 0x30, 0x8d, 0x31, 0xf97a, 27],
  217. [0x84, 0x30, 0x8f, 0x38, 0xf996, 81],
  218. [0x84, 0x30, 0x97, 0x39, 0xf9e8, 9],
  219. [0x84, 0x30, 0x98, 0x38, 0xf9f2, 26],
  220. [0x84, 0x30, 0x9b, 0x34, 0xfa10, 1],
  221. [0x84, 0x30, 0x9b, 0x35, 0xfa12, 1],
  222. [0x84, 0x30, 0x9b, 0x36, 0xfa15, 3],
  223. [0x84, 0x30, 0x9b, 0x39, 0xfa19, 6],
  224. [0x84, 0x30, 0x9c, 0x35, 0xfa22, 1],
  225. [0x84, 0x30, 0x9c, 0x36, 0xfa25, 2],
  226. [0x84, 0x30, 0x9c, 0x38, 0xfa2a, 1030],
  227. [0x84, 0x31, 0x85, 0x38, 0xfe32, 1],
  228. [0x84, 0x31, 0x85, 0x39, 0xfe45, 4],
  229. [0x84, 0x31, 0x86, 0x33, 0xfe53, 1],
  230. [0x84, 0x31, 0x86, 0x34, 0xfe58, 1],
  231. [0x84, 0x31, 0x86, 0x35, 0xfe67, 1],
  232. [0x84, 0x31, 0x86, 0x36, 0xfe6c, 149],
  233. [0x84, 0x31, 0x95, 0x35, 0xff5f, 129],
  234. [0x84, 0x31, 0xa2, 0x34, 0xffe6, 26],
  235. ];
  236. // We will test 4-byte codes separately
  237. findInvalidChars($toUnicode, $invalid, $truncated);
  238. function notFourByteCode($gb) {
  239. return ((ord($gb) < 0x81 || ord($gb) > 0x84) && (ord($gb) < 0x90 || ord($gb) > 0xE3)) ||
  240. (strlen($gb) > 1 && (ord($gb[1]) < 0x30 || ord($gb[1]) > 0x39));
  241. }
  242. $invalid = array_filter($invalid, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
  243. $truncated = array_filter($truncated, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
  244. testAllValidChars($toUnicode, 'GB18030', 'UTF-16BE', false);
  245. testAllInvalidChars($invalid, $toUnicode, 'GB18030', 'UTF-16BE', "\x00%");
  246. testTruncatedChars($truncated, 'GB18030', 'UTF-16BE', "\x00%");
  247. echo "Tested GB18030 (1 and 2 byte characters) -> UTF-16BE\n";
  248. // Test one random 4-byte code for each range used for Unicode codepoints in BMP
  249. function fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) {
  250. return (($byte4 - 0x81) * 10 * 126 * 10) + (($byte3 - 0x30) * 10 * 126) + (($byte2 - 0x81) * 10) + ($byte1 - 0x30);
  251. }
  252. function fourByteCodeFromIndex($index) {
  253. $quotient = intdiv($index, 10 * 126 * 10);
  254. $byte4 = $quotient + 0x81;
  255. $index -= ($quotient * 10 * 126 * 10);
  256. $quotient = intdiv($index, 10 * 126);
  257. $byte3 = $quotient + 0x30;
  258. $index -= ($quotient * 10 * 126);
  259. $quotient = intdiv($index, 10);
  260. $byte2 = $quotient + 0x81;
  261. $byte1 = $index - ($quotient * 10) + 0x30;
  262. return chr($byte4) . chr($byte3) . chr($byte2) . chr($byte1);
  263. }
  264. foreach ($gb18030_BMP_Mappings as $mapping) {
  265. [$byte4, $byte3, $byte2, $byte1, $unicode, $n] = $mapping;
  266. $i = rand(0, $n-1);
  267. $gb = fourByteCodeFromIndex(fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) + $i);
  268. $unicode += $i;
  269. testValidString($gb, pack('n', $unicode), 'GB18030', 'UTF-16BE');
  270. }
  271. // Invalid 4-byte codes in range for BMP
  272. testInvalidString("\x81\x30\x81\xFF", "\x00\x00\x00%", "GB18030", "UTF-32BE");
  273. testInvalidString("\x84\x31\xA4\x40", "\x00\x00\x00%", "GB18030", "UTF-32BE");
  274. testInvalidString("\x84\x31\xA5\x30", "\x00\x00\x00%", "GB18030", "UTF-32BE");
  275. testInvalidString("\x84\x32\x81\x30", "\x00\x00\x00%", "GB18030", "UTF-32BE");
  276. testInvalidString("\x85\x31\x81\x30", "\x00\x00\x00%\x00\x00\x00%", "GB18030", "UTF-32BE");
  277. // Valid 4-byte codes for other Unicode planes
  278. testValidString("\x90\x30\x81\x30", "\x00\x01\x00\x00", "GB18030", "UTF-32BE");
  279. testValidString("\xE3\x32\x9A\x35", "\x00\x10\xFF\xFF", "GB18030", "UTF-32BE");
  280. // Invalid 4-byte codes for other Unicode planes
  281. testInvalidString("\x90\x30\x81\xFF", "\x00\x00\x00%", "GB18030", "UTF-32BE");
  282. testInvalidString("\xE3\x32\x9A\x36", "\x00\x00\x00%", "GB18030", "UTF-32BE");
  283. testInvalidString("\xE4\x30\x81\x35", "\x00\x00\x00%\x00\x00\x00%", "GB18030", "UTF-32BE");
  284. testInvalidString("\x90\x30\x80\x30", "\x00\x00\x00%\x00\x00\x00\x30", "GB18030", "UTF-32BE");
  285. echo "Tested GB18030 4-byte characters <-> UTF-16BE\n";
  286. testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030', false);
  287. echo "Tested UTF-16BE -> GB18030 (1 and 2 byte characters)\n";
  288. // Test "long" illegal character markers
  289. mb_substitute_character("long");
  290. convertInvalidString("\x81\x30\x81\xFF", "%", "GB18030", "UTF-8");
  291. convertInvalidString("\xE3\x32\x9A\x36", "%", "GB18030", "UTF-8");
  292. echo "Done!\n";
  293. ?>
  294. --EXPECT--
  295. Tested GB18030 (1 and 2 byte characters) -> UTF-16BE
  296. Tested GB18030 4-byte characters <-> UTF-16BE
  297. Tested UTF-16BE -> GB18030 (1 and 2 byte characters)
  298. Done!