iso2022jp_kddi_encoding.phpt 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. --TEST--
  2. Exhaustive test of ISO-2022-JP-KDDI text encoding
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  /* This test is slow; let the runner skip it when SKIP_SLOW_TESTS is set */
  if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
  }
  8. ?>
  9. --FILE--
  10. <?php
  /* Fixed seed, so that the shuffle()/rand() calls made further down yield the
   * same sequence on every run */
  srand(390);
  /* Shared helpers for the encoding tests (presumably where testValidString,
   * convertInvalidString, findInvalidChars etc. come from) */
  include 'encoding_tests.inc';
  /* Substitute '%' for anything which cannot be converted */
  mb_substitute_character(ord('%'));
  /* Convert a 2-byte Shift-JIS code (CP932's default representation), passed as
   * an integer, into a kuten code: row in the high byte, cell in the low byte,
   * both starting from 0x21 */
  function shiftJISDecode($bytes) {
    $lead  = ($bytes >> 8) & 0xFF;
    $trail = $bytes & 0xFF;

    /* Lead bytes come in two ranges; the one above 0x9F continues counting
     * where the lower one stops (index 31 onwards) */
    $pairIndex = ($lead > 0x9F) ? $lead - (0xE0 - 31) : $lead - 0x81;

    /* Each lead byte covers two rows; the trail byte selects which one */
    $row = ($pairIndex << 1) + 0x21;
    if ($trail > 0x9E) {
      /* Second row of the pair */
      $row += 1;
      $cell = $trail - 0x9F + 0x21;
    } elseif ($trail > 0x7F) {
      /* First row, after the gap at 0x7F: 63 cells precede this range */
      $cell = $trail - 0x80 + 63 + 0x21;
    } else {
      /* First row, trail bytes from 0x40 up */
      $cell = $trail - 0x40 + 0x21;
    }

    return ($row << 8) + $cell;
  }
  /* Read in the table of all characters in CP932.
   * Keys are kuten codes (shiftJISDecode() of the Shift-JIS bytes, packed as 2
   * big-endian bytes); values are UTF-32BE strings */
  $cp932Chars = array();
  $cp932Path = __DIR__ . '/data/CP932.txt';
  /* The table is only ever read, so open it read-only; 'r+' would fail
   * needlessly when the test data is not writable */
  $fp = fopen($cp932Path, 'r');
  if ($fp === false)
    die("Could not open $cp932Path\n");
  /* Compare against false explicitly so that a falsy line such as "0" cannot
   * end the loop early */
  while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
      continue;
    if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
      /* Single-byte entries are not part of the 2-byte table */
      if ($bytes < 256)
        continue;
      /* For ISO-2022-JP-KDDI, we only accept the first range of Microsoft
       * vendor extensions, in ku 13 */
      if ($bytes > 0xEAA4)
        continue;
      $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('N', $codepoint);
    }
  }
  fclose($fp);
  /* Add KDDI-specific emoji to the CP932 characters.
   * Only emoji whose Shift-JIS code decodes to ku 106-112 are taken, and they
   * are stored 22 ku away from that position (22 * 0x100 is subtracted from
   * the kuten code) */
  $emojiPath = __DIR__ . '/data/EmojiSources.txt';
  /* The table is only ever read, so open it read-only; 'r+' would fail
   * needlessly when the test data is not writable */
  $fp = fopen($emojiPath, 'r');
  if ($fp === false)
    die("Could not open $emojiPath\n");
  /* Compare against false explicitly so that a falsy line such as "0" cannot
   * end the loop early */
  while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
      continue;
    $fields = explode(';', rtrim($line));
    if (count($fields) >= 4) {
      /* The first field holds either one or two hex codepoints */
      if (sscanf($fields[0], "%x %x", $cp1, $cp2) == 2)
        $utf32 = pack('N', $cp1) . pack('N', $cp2);
      else
        $utf32 = pack('N', hexdec($fields[0]));
      /* The third field is the Shift-JIS code used here (presumably the KDDI
       * column); it is empty when there is no such mapping */
      if ($fields[2]) {
        $kuten = shiftJISDecode(hexdec($fields[2]));
        $ku = $kuten >> 8;
        if ($ku >= 106 && $ku <= 112)
          $cp932Chars[pack('n', $kuten - (22 * 0x100))] = $utf32;
      }
    }
  }
  fclose($fp);
  /* CP932 maps these Shift-JIS codes to characters which also have another
   * mapping, so they cannot be round-tripped. Remember them (kuten => UTF-32BE)
   * so that they can be checked in one direction only. */
  $duplicateSjisCodes = [
    0x8790, 0x8791, 0x8792,
    0x8795, 0x8796, 0x8797,
    0x879A, 0x879B, 0x879C,
  ];
  $nonInvertible = array();
  foreach ($duplicateSjisCodes as $sjis) {
    $kutenBytes = pack('n', shiftJISDecode($sjis));
    $nonInvertible[$kutenBytes] = $cp932Chars[$kutenBytes];
  }
  /* Read in table of all characters in JISX-0201 charset.
   * Keys are single JIS X 0201 bytes; values are UTF-32BE strings */
  $jisx0201Chars = array();
  $jisx0201Path = __DIR__ . '/data/JISX0201.txt';
  /* The table is only ever read, so open it read-only; 'r+' would fail
   * needlessly when the test data is not writable */
  $fp = fopen($jisx0201Path, 'r');
  if ($fp === false)
    die("Could not open $jisx0201Path\n");
  /* Compare against false explicitly so that a falsy line such as "0" cannot
   * end the loop early */
  while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
      continue;
    if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) == 2)
      $jisx0201Chars[chr($byte)] = pack('N', $codepoint);
  }
  fclose($fp);
  /* Check that $from is identified as valid $encoding text and converts to the
   * UTF-32BE string $to. Unless $bothWays is false, also check that $to
   * converts back to (a normalized form of) $from. */
  function testValid($from, $to, $encoding, $bothWays = true) {
    identifyValidString($from, $encoding);
    convertValidString($from, $to, $encoding, 'UTF-32BE', false);

    if (!$bothWays)
      return;

    /* ASCII mode is the default, so the converter is not expected to emit a
     * leading ESC ( B */
    if (str_starts_with($from, "\x1B(B"))
      $from = substr($from, 3);

    /* After any switch to another charset, the output should finish by
     * switching back to ASCII */
    foreach (["\x1B\$B", "\x1B(I", "\x1B\$@", "\x1B\$(B"] as $charsetSwitch) {
      if (str_contains($from, $charsetSwitch)) {
        $from .= "\x1B(B";
        break;
      }
    }

    convertValidString($to, $from, 'UTF-32BE', $encoding, false);
  }
  /* Shorthand for testInvalidString() with UTF-32BE as the target encoding */
  function testInvalid($from, $to, $encoding) {
    $target = 'UTF-32BE';
    testInvalidString($from, $to, $encoding, $target);
  }
  /* Every 7-bit byte except ESC converts to itself, whether it comes first in
   * the string or follows an ESC ( B or ESC ( J sequence. The bare byte is also
   * checked in the reverse direction. */
  foreach (range(0, 0x7F) as $byte) {
    if ($byte === 0x1B)
      continue;
    $char = chr($byte);
    $expected = "\x00\x00\x00" . $char;
    testValid($char, $expected, 'ISO-2022-JP-KDDI');
    foreach (["\x1B(B", "\x1B(J"] as $escape)
      testValid($escape . $char, $expected, 'ISO-2022-JP-KDDI', false);
  }

  /* Bytes with the high bit set are rejected in those same three contexts,
   * apart from 0xA1-0xDF, which we convert as JIS X 0201 kana */
  foreach (range(0x80, 0xFF) as $byte) {
    $isKana = ($byte >= 0xA1 && $byte <= 0xDF);
    if ($isKana)
      continue;
    foreach (['', "\x1B(B", "\x1B(J"] as $prefix)
      testInvalid($prefix . chr($byte), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
  }

  echo "ASCII support OK\n";
  /* JIS X 0201 kana are the table entries with the high bit set. Each one is
   * accepted both as a 7-bit byte following ESC ( I and as the raw 8-bit byte. */
  foreach ($jisx0201Chars as $rawByte => $expected) {
    $code = ord($rawByte);
    if ($code < 128)
      continue;
    testValid("\x1B(I" . chr($code - 128), $expected, 'ISO-2022-JP-KDDI', false);
    testValid($rawByte, $expected, 'ISO-2022-JP-KDDI', false);
  }

  /* 8-bit bytes outside of the kana range are invalid after ESC ( I and after
   * ESC ( J */
  foreach (range(0x80, 0xFF) as $byte) {
    $isKana = ($byte >= 0xA1 && $byte <= 0xDF);
    if ($isKana)
      continue;
    foreach (["\x1B(I", "\x1B(J"] as $escape)
      testInvalid($escape . chr($byte), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
  }

  echo "JIS X 0201 support OK\n";
  /* Everything which is acceptable while in JIS X 0208 mode: the 2-byte table
   * built above, plus ASCII and JIS X 0201 kana bytes, which we allow even in
   * that mode */
  $validChars = $cp932Chars;
  foreach (range(0, 0x7F) as $ascii)
    $validChars[chr($ascii)] = chr($ascii);
  foreach (range(0xA1, 0xDF) as $kana)
    $validChars[chr($kana)] = $jisx0201Chars[chr($kana)];

  /* Bytes 0xE0-0xFC and 0x81-0x9F are listed with a length of 2 */
  $lenTable = array_fill_keys(array_merge(range(0xE0, 0xFC), range(0x81, 0x9F)), 2);

  /* Presumably fills $invalidChars and $truncatedChars, both of which are
   * iterated over by key further down */
  findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);
  /* The duplicate mappings are checked separately, in one direction only */
  foreach (array_keys($nonInvertible) as $duplicate)
    unset($cp932Chars[$duplicate]);

  /* Shuffle all the characters in $table (kuten => UTF-32BE), then test them in
   * runs of 5 to 10 characters (fewer once the supply runs out), each run
   * prefixed with ESC $ B. $roundTrip says whether the reverse conversion is
   * checked as well. */
  $checkInRandomRuns = function ($table, $roundTrip) {
    $pending = array_keys($table);
    shuffle($pending);
    while (count($pending) > 0) {
      $runLength = min(rand(5,10), count($pending));
      $encoded = '';
      $decoded = '';
      for ($n = 0; $n < $runLength; $n++) {
        $char = array_pop($pending);
        $encoded .= $char;
        $decoded .= $table[$char];
      }
      testValid("\x1B\$B" . $encoded, $decoded, 'ISO-2022-JP-KDDI', $roundTrip);
    }
  };

  $checkInRandomRuns($cp932Chars, true);
  $checkInRandomRuns($nonInvertible, false);
  foreach ($invalidChars as $invalid => $ignored) {
    $firstByte = ord($invalid[0]);
    $isRejectedLeadByte = ($firstByte >= 0x81 && $firstByte <= 0x9F) || $firstByte >= 0xE0;
    /* When the first byte of a 2-byte sequence is rejected, it produces % and
     * the second byte is then handled on its own, which is awkward to predict.
     * In that case, feed in the first byte only. */
    $input = $isRejectedLeadByte ? $invalid[0] : $invalid;
    testInvalidString("\x1B\$B" . $input, "\x00\x00\x00%", 'ISO-2022-JP-KDDI', 'UTF-32BE');
  }

  /* Truncated sequences give a single % as well */
  foreach ($truncatedChars as $truncated => $ignored) {
    testInvalidString("\x1B\$B" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-KDDI', 'UTF-32BE');
  }

  echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n";
  /* Additional Unicode -> ISO-2022-JP-KDDI conversions (U+00A5, U+203E and
   * U+FF5E), checked in this direction only */
  $extraMappings = [
    "\x00\xA5" => "\x1B\$B!o\x1B(B",
    "\x20\x3E" => "\x1B\$B!1\x1B(B",
    "\xFF\x5E" => "\x1B\$B!A\x1B(B",
  ];
  foreach ($extraMappings as $utf16 => $kddi) {
    testValidString($utf16, $kddi, "UTF-16BE", "ISO-2022-JP-KDDI", false);
  }

  echo "Other mappings from Unicode -> ISO-2022-JP-KDDI OK\n";

  // Test "long" illegal character markers
  mb_substitute_character("long");
  convertInvalidString("\xE0", "%", "ISO-2022-JP-KDDI", "UTF-8");

  // Invalid escapes: a bare ESC, plus sequences which go wrong at each later position
  $badEscapes = ["\x1B", "\x1B.", "\x1B\$", "\x1B\$.", "\x1B\$(X"];
  foreach ($badEscapes as $escape) {
    convertInvalidString($escape, "%", "ISO-2022-JP-KDDI", "UTF-8");
  }

  // 0x9F does not start any 2-byte character
  convertInvalidString("\x1B\$B\x9F", "%", "ISO-2022-JP-KDDI", "UTF-8");

  // U+E000 is reported with the long marker form
  convertInvalidString("\xE0\x00", "U+E000", "UTF-16BE", "ISO-2022-JP-KDDI");

  echo "Done!\n";
  192. ?>
  193. --EXPECT--
  194. ASCII support OK
  195. JIS X 0201 support OK
  196. JIS X 0208 (with MS extensions) and KDDI emoji support OK
  197. Other mappings from Unicode -> ISO-2022-JP-KDDI OK
  198. Done!