cp5022x_encoding.phpt 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. --TEST--
  2. Exhaustive test of CP50220, CP50221, and CP50222 encodings
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  // This exhaustive test is slow; let the harness skip it when SKIP_SLOW_TESTS is set.
  if (getenv("SKIP_SLOW_TESTS")) {
      die("skip slow test");
  }
  8. ?>
  9. --FILE--
  10. <?php
  /* Pull in the shared test helpers. NOTE(review): identifyValidString(),
   * convertValidString(), testInvalidString() and convertInvalidString() are
   * not defined in this file, so they presumably come from here. */
  include 'encoding_tests.inc';

  /* Use '%' (0x25) as the substitute character; the expected outputs of the
   * invalid-input tests below contain '%' for that reason. */
  mb_substitute_character(0x25);
  /**
   * Convert a two-byte CP932 (Shift-JIS) code to its kuten code.
   *
   * A Shift-JIS lead byte only carries the upper bits of the ku (row) number,
   * because 94 distinct lead bytes were not available. The trail byte falls in
   * one of two ranges, and which range it is in supplies the low bit of the ku
   * number.
   *
   * @param int $bytes Lead byte in bits 8-15, trail byte in bits 0-7
   * @return int       Kuten code: (ku + 0x20) in the high byte, (ten + 0x20) in the low byte
   */
  function shiftJISDecode($bytes) {
      $lead  = ($bytes >> 8) & 0xFF;
      $trail = $bytes & 0xFF;

      /* Lead bytes above 0x9F are rebased so that 0xE0 continues at pair 31 */
      $pairIndex = $lead - ($lead > 0x9F ? 0xE0 - 31 : 0x81);
      $row = ($pairIndex << 1) + 0x21;

      if ($trail > 0x9E) {
          /* Upper trail range: second (even) row of the pair */
          $row += 1;
          $cell = $trail - 0x9F;
      } elseif ($trail > 0x7F) {
          /* Trail bytes 0x80 and up sit after the unused 0x7F, hence the +63 */
          $cell = $trail - 0x80 + 63;
      } else {
          $cell = $trail - 0x40;
      }

      return ($row << 8) + ($cell + 0x21);
  }
  /* Read in table of all characters in CP932 charset.
   *
   * $cp932Chars:    packed kuten code (see shiftJISDecode) -> packed UTF-16BE codepoint
   * $nonInvertible: same shape, but for CP932 codes whose Unicode codepoint was
   *                 already claimed by an earlier line of the table
   * $fromUnicode:   codepoint -> first CP932 code seen for it */
  $cp932Chars = [];
  $nonInvertible = [];
  $fromUnicode = [];

  /* Open read-only: the table is never written, and asking for write access
   * would make the test fail when the source tree is read-only. */
  $fp = fopen(__DIR__ . '/data/CP932.txt', 'r');
  while (($line = fgets($fp, 256)) !== false) {
      if ($line[0] == '#') {
          continue; /* comment line */
      }
      if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
          if ($bytes < 256) {
              continue; /* single-byte entries are not handled by this table */
          }
          $kuten = pack('n', shiftJISDecode($bytes));
          if (isset($fromUnicode[$codepoint])) {
              /* A second CP932 code for the same codepoint: keep it apart */
              $nonInvertible[$kuten] = pack('n', $codepoint);
          } else {
              $cp932Chars[$kuten] = pack('n', $codepoint);
              $fromUnicode[$codepoint] = $bytes;
          }
      }
  }
  fclose($fp);
  /* Besides the characters from the table, a 'user' area is also supported.
   * Lead bytes 0xF0-0xF9 with trail bytes 0x40-0xFC (0x7F excluded) are
   * assigned consecutive Unicode 'private' codepoints 0xE000-E757. */
  $codepoint = 0xE000;
  foreach (range(0xF0, 0xF9) as $leadByte) {
      foreach (range(0x40, 0xFC) as $trailByte) {
          if ($trailByte != 0x7F) {
              $userKuten = pack('n', shiftJISDecode(($leadByte << 8) + $trailByte));
              $cp932Chars[$userKuten] = pack('n', $codepoint);
              $codepoint++;
          }
      }
  }
  /* Read in table of all characters in JISX-0201 charset.
   * $jisx0201Chars: single JIS X 0201 byte -> packed UTF-16BE codepoint */
  $jisx0201Chars = [];

  /* Open read-only: the table is never written, and asking for write access
   * would make the test fail when the source tree is read-only. */
  $fp = fopen(__DIR__ . '/data/JISX0201.txt', 'r');
  while (($line = fgets($fp, 256)) !== false) {
      if ($line[0] == '#') {
          continue; /* comment line */
      }
      if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) == 2) {
          $jisx0201Chars[chr($byte)] = pack('n', $codepoint);
      }
  }
  fclose($fp);
  /* Read in table of all characters in JISX-0212 charset.
   * $jisx0212Chars: packed two-byte JIS X 0212 code -> packed UTF-16BE codepoint */
  $jisx0212Chars = [];

  /* Open read-only: the table is never written, and asking for write access
   * would make the test fail when the source tree is read-only. */
  $fp = fopen(__DIR__ . '/data/JISX0212.txt', 'r');
  while (($line = fgets($fp, 256)) !== false) {
      if ($line[0] == '#') {
          continue; /* comment line */
      }
      if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
          $jisx0212Chars[pack('n', $bytes)] = pack('n', $codepoint);
      }
  }
  fclose($fp);
  /* Where our CP5022x <-> Unicode conversion (with the CP932 charset selected)
   * deliberately deviates from the Unicode Consortium's table, overwrite the
   * table entry with the codepoint we actually produce. */
  $cp932Overrides = [
      "\x21\x41" => "\x30\x1C", /* WAVE DASH instead of FULLWIDTH TILDE */
      "\x21\x42" => "\x20\x16", /* DOUBLE VERTICAL LINE instead of PARALLEL TO */
      "\x21\x5D" => "\x22\x12", /* MINUS SIGN instead of FULLWIDTH HYPHEN-MINUS */
      "\x21\x71" => "\x00\xA2", /* CENT SIGN instead of FULLWIDTH CENT SIGN */
      "\x21\x72" => "\x00\xA3", /* POUND SIGN instead of FULLWIDTH POUND SIGN */
      "\x22\x4C" => "\x00\xAC", /* NOT SIGN instead of FULLWIDTH NOT SIGN */
  ];
  foreach ($cp932Overrides as $overrideKuten => $overrideUtf16) {
      $cp932Chars[$overrideKuten] = $overrideUtf16;
  }
  /**
   * Check that $from is valid in $encoding and converts to $to (UTF-16BE);
   * optionally also check the reverse conversion.
   *
   * For the reverse direction, $from is first normalised to the form the
   * encoder is expected to emit (see the inline comments).
   *
   * @param string $from     Byte string in $encoding
   * @param string $to       Expected UTF-16BE result
   * @param string $encoding 'CP50220', 'CP50221' or 'CP50222'
   * @param bool   $bothWays Also test UTF-16BE -> $encoding when true
   */
  function testValid($from, $to, $encoding, $bothWays = true) {
      identifyValidString($from, $encoding);
      convertValidString($from, $to, $encoding, 'UTF-16BE', false);

      if (!$bothWays) {
          return;
      }

      /* A leading 0x0F only selects ASCII mode, which is already the default,
       * so it is not expected in the encoder's output */
      if ($from[0] === "\x0F") {
          $from = substr($from, 1);
      }

      /* Likewise a leading ESC ( B is redundant, ASCII being the default */
      if (substr($from, 0, 3) === "\x1B(B") {
          $from = substr($from, 3);
      }

      /* A string which selects another charset should return to ASCII at the end */
      foreach (["\x1B\$B", "\x1B(J", "\x1B(I"] as $charsetEscape) {
          if (strpos($from, $charsetEscape) !== false) {
              $from .= "\x1B(B";
              break;
          }
      }

      /* In CP50222 a string starting with 0x0E is expected to end with 0x0F */
      if ($encoding == 'CP50222' && $from[0] === "\x0E") {
          $from .= "\x0F";
      }

      convertValidString($to, $from, 'UTF-16BE', $encoding, false);
  }
  /**
   * Shorthand for testInvalidString() with UTF-16BE as the target encoding.
   *
   * @param string $from     Invalid byte string in $encoding
   * @param string $to       Expected UTF-16BE output
   * @param string $encoding Source encoding name
   */
  function testInvalid($from, $to, $encoding) {
      $targetEncoding = 'UTF-16BE';
      testInvalidString($from, $to, $encoding, $targetEncoding);
  }
  /* Every 7-bit byte except the control codes 0x0E (Shift Out), 0x0F (Shift In)
   * and 0x1B (ESC) must pass through as ASCII, both in the default mode and
   * after an explicit ESC ( B. */
  for ($byte = 0; $byte < 0x80; $byte++) {
      if (in_array($byte, [0xE, 0xF, 0x1B], true)) {
          continue;
      }
      $ascii = chr($byte);
      $asciiUtf16 = "\x00" . $ascii;
      foreach (['', "\x1B(B"] as $prefix) {
          foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
              testValid($prefix . $ascii, $asciiUtf16, $encoding);
          }
      }
      /* 0x0F (Shift In) is accepted by CP50222 and leaves us in ASCII mode */
      testValid("\x0F" . $ascii, $asciiUtf16, 'CP50222', false);
  }

  /* Bytes with the high bit set are invalid in ASCII mode */
  for ($byte = 0x80; $byte < 256; $byte++) {
      /* ...except 0xA1-0xDF, which we convert as JIS X 0201 kana */
      if ($byte >= 0xA1 && $byte <= 0xDF) {
          continue;
      }
      foreach (['', "\x1B(B", "\x0F"] as $prefix) {
          foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
              testInvalid($prefix . chr($byte), "\x00%", $encoding);
          }
      }
  }

  // Switch back to ASCII after a multibyte character
  foreach (['CP50221', 'CP50222'] as $encoding) {
      convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', $encoding, false);
  }

  echo "ASCII support OK\n";
  /* All valid JIS X 0201 characters.
   * Bytes with the high bit set are JIS X 0201 kana; the rest are JIS X 0201 Latin. */
  foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
      if (ord($jisx0201) < 128) { /* Latin */
          /* Only codepoints above U+0080 are also tested in the reverse direction.
           * NOTE(review): presumably because the others encode back as plain ASCII. */
          $roundTrip = $utf16BE > "\x00\x80";
          foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
              testValid("\x1B(J" . $jisx0201, $utf16BE, $encoding, $roundTrip);
          }
          continue;
      }

      /* Kana: the 7-bit form is the same byte with the high bit cleared */
      $kana = chr(ord($jisx0201) - 128);
      testValid("\x1B(I" . $kana, $utf16BE, 'CP50221');
      testValid("\x1B(J\x0E" . $kana, $utf16BE, 'CP50222', false); /* 0x0E is the 'Shift Out' code */
      testValid("\x0E" . $kana, $utf16BE, 'CP50222', false);

      /* The raw 8-bit byte is accepted as input by all three encodings */
      foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
          testValid($jisx0201, $utf16BE, $encoding, false);
      }

      /* CP50222 output wraps kana in 0x0E ... 0x0F */
      convertValidString($utf16BE, "\x0E" . $kana . "\x0F", 'UTF-16BE', 'CP50222', false);
  }

  /* High-bit bytes outside 0xA1-0xDF are invalid in both JIS X 0201 modes */
  for ($byte = 0x80; $byte < 256; $byte++) {
      if ($byte >= 0xA1 && $byte <= 0xDF) {
          continue;
      }
      foreach (["\x1B(I", "\x1B(J"] as $modeEscape) {
          foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
              testInvalid($modeEscape . chr($byte), "\x00%", $encoding);
          }
      }
  }

  /* Go from JIS X 0201 to ASCII or JIS X 0208 */
  $halfwidthStop = "\xFF\x61";
  convertValidString($halfwidthStop . "\x00A", "\x0E\x21\x0FA", 'UTF-16BE', 'CP50222', false);
  convertValidString($halfwidthStop . "\x22\x25", "\x0E\x21\x0F\x1B\$B\x21\x42\x1B(B", 'UTF-16BE', 'CP50222', false);
  convertValidString($halfwidthStop . "\x20\x3E", "\x0E\x21\x0F\x1B(J\x7E\x1B(B", 'UTF-16BE', 'CP50222');

  echo "JIS X 0201 support OK\n";
  /* All valid CP932 characters, selected with ESC $ B; tested in both directions */
  foreach ($cp932Chars as $cp932 => $utf16BE) {
      foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
          testValid("\x1B\$B" . $cp932, $utf16BE, $encoding);
      }
  }

  /* Codes sharing a codepoint with another code can only be tested when decoding */
  foreach ($nonInvertible as $cp932 => $utf16BE) {
      foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
          testValid("\x1B\$B" . $cp932, $utf16BE, $encoding, false);
      }
  }

  /* There are some conversions we support from Unicode -> CP5022x, but not in the opposite direction */
  $encodeOnly = [
      "\x22\x25" => "\x21\x42",
      "\xFF\x0D" => "\x21\x5D",
      "\xFF\xE0" => "\x21\x71",
      "\xFF\xE1" => "\x21\x72",
      "\xFF\xE2" => "\x22\x4C",
  ];
  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
      foreach ($encodeOnly as $unicode => $kutenBytes) {
          convertValidString($unicode, "\x1B\$B" . $kutenBytes . "\x1B(B", 'UTF-16BE', $encoding, false);
      }
  }

  /* All invalid 2-byte CP932 characters: any pair absent from both tables */
  for ($first = 0x21; $first <= 0x97; $first++) {
      for ($second = 0; $second < 256; $second++) {
          $testString = chr($first) . chr($second);
          if (isset($cp932Chars[$testString]) || isset($nonInvertible[$testString])) {
              continue;
          }
          foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
              testInvalid("\x1B\$B" . $testString, "\x00%", $encoding);
          }
      }
  }

  /* Try truncated 2-byte characters */
  for ($first = 0x21; $first <= 0x97; $first++) {
      foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
          testInvalid("\x1B\$B" . chr($first), "\x00%", $encoding);
      }
  }

  /* Test alternative escape sequence to select CP932 */
  testValid("\x1B\$(B\x21\x21", "\x30\x00", 'CP50220', false);

  echo "CP932 support OK\n";
  /* All valid JIS X 0212 characters, selected with ESC $ ( D; decode direction only */
  foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
      foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
          testValid("\x1B\$(D" . $jisx0212, $utf16BE, $encoding, false);
      }
  }

  /* Any 2-byte sequence which is not in the JIS X 0212 table must be rejected */
  for ($first = 0x21; $first <= 0x97; $first++) {
      for ($second = 0; $second < 256; $second++) {
          $testString = chr($first) . chr($second);
          if (isset($jisx0212Chars[$testString])) {
              continue;
          }
          foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
              testInvalid("\x1B\$(D" . $testString, "\x00%", $encoding);
          }
      }
  }

  /* Truncated 2-byte characters */
  for ($first = 0x21; $first <= 0x97; $first++) {
      foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
          testInvalid("\x1B\$(D" . chr($first), "\x00%", $encoding);
      }
  }

  echo "JIS X 0212 support OK\n";
  /* Unicode codepoint of each halfwidth katakana (U+FF61-U+FF9F) -> kuten code
   * of the corresponding ordinary katakana or punctuation mark */
  $fullwidthKatakana = [
      /* Punctuation */
      0xFF61 => 0x2123, /* Ideographic full stop */
      0xFF62 => 0x2156, /* Left corner bracket */
      0xFF63 => 0x2157, /* Right corner bracket */
      0xFF64 => 0x2122, /* Ideographic comma */
      0xFF65 => 0x2126, /* Katakana middle dot */

      /* Wo and the small kana */
      0xFF66 => 0x2572, /* Wo */
      0xFF67 => 0x2521, /* Small A */
      0xFF68 => 0x2523, /* Small I */
      0xFF69 => 0x2525, /* Small U */
      0xFF6A => 0x2527, /* Small E */
      0xFF6B => 0x2529, /* Small O */
      0xFF6C => 0x2563, /* Small Ya */
      0xFF6D => 0x2565, /* Small Yu */
      0xFF6E => 0x2567, /* Small Yo */
      0xFF6F => 0x2543, /* Small Tsu */
      0xFF70 => 0x213C, /* Prolonged Sound Marker */

      /* Regular kana */
      0xFF71 => 0x2522, /* A */
      0xFF72 => 0x2524, /* I */
      0xFF73 => 0x2526, /* U */
      0xFF74 => 0x2528, /* E */
      0xFF75 => 0x252A, /* O */
      0xFF76 => 0x252B, /* Ka */
      0xFF77 => 0x252D, /* Ki */
      0xFF78 => 0x252F, /* Ku */
      0xFF79 => 0x2531, /* Ke */
      0xFF7A => 0x2533, /* Ko */
      0xFF7B => 0x2535, /* Sa */
      0xFF7C => 0x2537, /* Shi */
      0xFF7D => 0x2539, /* Su */
      0xFF7E => 0x253B, /* Se */
      0xFF7F => 0x253D, /* So */
      0xFF80 => 0x253F, /* Ta */
      0xFF81 => 0x2541, /* Chi */
      0xFF82 => 0x2544, /* Tsu */
      0xFF83 => 0x2546, /* Te */
      0xFF84 => 0x2548, /* To */
      0xFF85 => 0x254A, /* Na */
      0xFF86 => 0x254B, /* Ni */
      0xFF87 => 0x254C, /* Nu */
      0xFF88 => 0x254D, /* Ne */
      0xFF89 => 0x254E, /* No */
      0xFF8A => 0x254F, /* Ha */
      0xFF8B => 0x2552, /* Hi */
      0xFF8C => 0x2555, /* Fu */
      0xFF8D => 0x2558, /* He */
      0xFF8E => 0x255B, /* Ho */
      0xFF8F => 0x255E, /* Ma */
      0xFF90 => 0x255F, /* Mi */
      0xFF91 => 0x2560, /* Mu */
      0xFF92 => 0x2561, /* Me */
      0xFF93 => 0x2562, /* Mo */
      0xFF94 => 0x2564, /* Ya */
      0xFF95 => 0x2566, /* Yu */
      0xFF96 => 0x2568, /* Yo */
      0xFF97 => 0x2569, /* Ra */
      0xFF98 => 0x256A, /* Ri */
      0xFF99 => 0x256B, /* Ru */
      0xFF9A => 0x256C, /* Re */
      0xFF9B => 0x256D, /* Ro */
      0xFF9C => 0x256F, /* Wa */
      0xFF9D => 0x2573, /* N */

      /* Sound marks */
      0xFF9E => 0x212B, /* Voice Mark */
      0xFF9F => 0x212C, /* Semi-voice Mark */
  ];

  /* CP50220 is expected to emit the table's kuten code, wrapped in
   * ESC $ B ... ESC ( B, for each of these halfwidth codepoints */
  foreach ($fullwidthKatakana as $cp => $kuten) {
      $halfwidthUtf16 = pack('n', $cp);
      $expected = "\x1B\$B" . pack('n', $kuten) . "\x1B(B";
      convertValidString($halfwidthUtf16, $expected, 'UTF-16BE', 'CP50220', false);
  }

  /* The inputs above are halfwidth codepoints although this message says
   * "fullwidth"; the text must stay as is because --EXPECT-- pins it */
  echo "Folding of fullwidth katakana for CP50220 OK\n";
  /* "\xD8\x00" is a lone high surrogate in UTF-16BE; converting it to any of
   * the three encodings must be flagged and produce the substitute character */
  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
      testInvalidString("\xD8\x00", '%', 'UTF-16BE', $encoding);
  }

  echo "Invalid Unicode is flagged when converting to CP5022x\n";
  // Test "long" illegal character markers
  mb_substitute_character("long");

  /* First a stray high-bit byte, then a 2-byte character cut short after ESC $ B */
  foreach (["\x80", "\x1B\$B1"] as $badInput) {
      foreach (["CP50220", "CP50221", "CP50222"] as $encoding) {
          convertInvalidString($badInput, "%", $encoding, "UTF-8");
      }
  }

  echo "Long error markers OK\n";
  316. ?>
  317. --EXPECT--
  318. ASCII support OK
  319. JIS X 0201 support OK
  320. CP932 support OK
  321. JIS X 0212 support OK
  322. Folding of fullwidth katakana for CP50220 OK
  323. Invalid Unicode is flagged when converting to CP5022x
  324. Long error markers OK