iso2022jp_encoding.phpt 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. --TEST--
  2. Test of ASCII and JIS X 0201/0208/0212 support in ISO-2022-JP and JIS7/8 encodings
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  7. if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
  8. ?>
  9. --FILE--
  10. <?php
  11. include('encoding_tests.inc');
  12. mb_substitute_character(0x25); // '%'
  13. /* Read in table of all characters in JISX-0212 charset */
  14. readConversionTable(__DIR__ . '/data/JISX0212.txt', $jisx0212Chars, $unused);
  15. /* Read in table of all characters in JISX-0208 charset */
  16. $jisx0208Chars = array(); /* JISX0208 -> UTF-16BE */
  17. $fp = fopen(__DIR__ . '/data/JISX0208.txt', 'r+');
  18. while ($line = fgets($fp, 256)) {
  19. if ($line[0] == '#')
  20. continue;
  21. if (sscanf($line, "0x%x\t0x%x\t0x%x", $shiftJIS, $jis0208Code, $unicodeCP) == 3) {
  22. $jisx0208Chars[pack('n', $jis0208Code)] = pack('n', $unicodeCP);
  23. }
  24. }
  25. /* Read in table of all characters in JISX-0201 charset */
  26. readConversionTable(__DIR__ . '/data/JISX0201.txt', $jisx0201Chars, $unused);
  27. /* The JIS X 0208 character set does not have a single, straightforward
  28. * mapping to the Unicode character set
  29. * mbstring converts one character differently from the mappings in
  30. * ../docs/JISX0208.txt, which comes from the Unicode Consortium */
  31. /* 0x2140 is a backslash; this can be mapped to 0x005C for an ordinary
  32. * backslash, or 0xFF3C for a _fullwidth_ one */
  33. $jisx0208Chars["\x21\x40"] = "\xFF\x3C";
  34. function testValid($from, $to, $encoding, $bothWays = true) {
  35. identifyValidString($from, $encoding);
  36. convertValidString($from, $to, $encoding, 'UTF-16BE', false);
  37. if ($bothWays) {
  38. /* An 0xF at the beginning of a JIS7 string is redundant; it switches
  39. * to ASCII mode, but ASCII mode is default */
  40. if ($from[0] == "\x0F")
  41. $from = substr($from, 1, strlen($from) - 1);
  42. /* ESC ( B at the beginning is redundant, since ASCII mode is the default */
  43. if (substr($from, 0, 3) == "\x1B(B")
  44. $from = substr($from, 3, strlen($from) - 3);
  45. /* If the string switches to a different charset, it should switch back to
  46. * ASCII at the end */
  47. if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(J") !== false)
  48. $from .= "\x1B(B";
  49. convertValidString($to, $from, 'UTF-16BE', $encoding, false);
  50. }
  51. }
  52. function testInvalid($from, $to, $encoding) {
  53. testInvalidString($from, $to, $encoding, 'UTF-16BE');
  54. }
  55. for ($i = 0; $i < 0x80; $i++) {
  56. if ($i == 0xE || $i == 0xF || $i == 0x1B)
  57. continue;
  58. testValid(chr($i), "\x00" . chr($i), 'JIS');
  59. testValid("\x0F" . chr($i), "\x00" . chr($i), 'JIS'); /* 0xF is 'Shift Out' code */
  60. testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS');
  61. testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP');
  62. testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP');
  63. }
  64. for ($i = 0x80; $i < 256; $i++) {
  65. if ($i >= 0xA1 && $i <= 0xDF) // We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
  66. continue;
  67. testInvalid(chr($i), "\x00%", 'JIS');
  68. testInvalid("\x0F" . chr($i), "\x00%", 'JIS');
  69. testInvalid("\x1B(B" . chr($i), "\x00%", 'JIS');
  70. testInvalid(chr($i), "\x00%", 'ISO-2022-JP');
  71. testInvalid("\x1B(B" . chr($i), "\x00%", 'ISO-2022-JP');
  72. }
  73. echo "ASCII support OK\n";
  74. /* All valid JIS X 0201 characters
  75. * Those with a 1 in the high bit are JIS X 0201 kana; JIS7 encodes those
  76. * with a 0 in the high bit and treats them as a separate charset
  77. * (We don't test ISO-2022-JP here, as it does not support the JIS X 0201 charset) */
  78. foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
  79. if (ord($jisx0201) >= 128) {
  80. $kana = chr(ord($jisx0201) - 128);
  81. testValid("\x1B(I" . $kana, $utf16BE, 'JIS', false);
  82. testValid("\x0E" . $kana, $utf16BE, 'JIS', false); /* 0xE is 'Shift In' code */
  83. testValid($jisx0201, $utf16BE, 'JIS', false);
  84. } else {
  85. testValid("\x1B(J" . $jisx0201, $utf16BE, 'JIS', $utf16BE > "\x00\x80");
  86. }
  87. }
  88. for ($i = 0x80; $i < 256; $i++) {
  89. if ($i >= 0xA1 && $i <= 0xDF)
  90. continue;
  91. testInvalid("\x1B(I" . chr($i), "\x00%", 'JIS');
  92. testInvalid("\x1B(J" . chr($i), "\x00%", 'JIS');
  93. }
  94. echo "JIS X 0201 support OK\n";
  95. /* All valid JISX0208 characters */
  96. foreach ($jisx0208Chars as $jisx0208 => $utf16BE) {
  97. testValid("\x1B\$B" . $jisx0208, $utf16BE, 'JIS');
  98. testValid("\x1B\$B" . $jisx0208, $utf16BE, 'ISO-2022-JP');
  99. }
  100. /* All invalid 2-byte JISX0208 characters */
  101. for ($i = 0x21; $i <= 0x7E; $i++) {
  102. for ($j = 0; $j < 256; $j++) {
  103. $testString = chr($i) . chr($j);
  104. if (!isset($jisx0208Chars[$testString])) {
  105. testInvalid("\x1B\$B" . $testString, "\x00%", 'JIS');
  106. testInvalid("\x1B\$B" . $testString, "\x00%", 'ISO-2022-JP');
  107. }
  108. }
  109. }
  110. /* Try truncated JISX0208 characters */
  111. for ($i = 0x21; $i <= 0x7E; $i++) {
  112. testInvalid("\x1B\$B" . chr($i), "\x00%", 'JIS');
  113. testInvalid("\x1B\$B" . chr($i), "\x00%", 'ISO-2022-JP');
  114. }
  115. echo "JIS X 0208 support OK\n";
  116. /* JIS7 supports escape to switch to JIS X 0212 charset, but ISO-2022-JP does not */
  117. /* All valid JISX0212 characters */
  118. foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
  119. testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'JIS', false);
  120. }
  121. /* All invalid 2-byte JISX0212 characters */
  122. for ($i = 0x21; $i <= 0x7E; $i++) {
  123. for ($j = 0; $j < 256; $j++) {
  124. $testString = chr($i) . chr($j);
  125. if (!isset($jisx0212Chars[$testString])) {
  126. testInvalid("\x1B\$(D" . $testString, "\x00%", 'JIS');
  127. }
  128. }
  129. }
  130. /* Try truncated JISX0212 characters */
  131. for ($i = 0x21; $i <= 0x7E; $i++) {
  132. testInvalid("\x1B\$(D" . chr($i), "\x00%", 'JIS');
  133. }
  134. echo "JIS X 0212 support OK\n";
  135. /* All possible escape sequences */
  136. $validEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true];
  137. for ($i = 0; $i <= 0xFF; $i++) {
  138. for ($j = 0; $j <= 0xFF; $j++) {
  139. $escapeSequence = "\x1B" . chr($i) . chr($j);
  140. if ($escapeSequence === "\x1B\$(")
  141. continue;
  142. if (isset($validEscapes[$escapeSequence])) {
  143. testValid($escapeSequence, "", 'JIS', false);
  144. testValid($escapeSequence, "", 'ISO-2022-JP', false);
  145. } else {
  146. identifyInvalidString($escapeSequence, 'JIS');
  147. identifyInvalidString($escapeSequence, 'ISO-2022-JP');
  148. }
  149. }
  150. }
  151. for ($i = 0; $i <= 0xFF; $i++) {
  152. $escapeSequence = "\x1B\$(" . chr($i);
  153. if (isset($validEscapes[$escapeSequence])) {
  154. testValid($escapeSequence, "", 'JIS', false);
  155. testValid($escapeSequence, "", 'ISO-2022-JP', false);
  156. } else {
  157. identifyInvalidString($escapeSequence, 'JIS');
  158. identifyInvalidString($escapeSequence, 'ISO-2022-JP');
  159. }
  160. }
  161. echo "All escape sequences work as expected\n";
  162. foreach (['JIS', 'ISO-2022-JP'] as $encoding) {
  163. testValidString("\x22\x25", "\x1B\$B!B\x1B(B", 'UTF-16BE', $encoding, false);
  164. testValidString("\xFF\x0D", "\x1B\$B!]\x1B(B", 'UTF-16BE', $encoding, false);
  165. testValidString("\xFF\xE0", "\x1B\$B!q\x1B(B", 'UTF-16BE', $encoding, false);
  166. testValidString("\xFF\xE1", "\x1B\$B!r\x1B(B", 'UTF-16BE', $encoding, false);
  167. testValidString("\xFF\xE2", "\x1B\$B\"L\x1B(B", 'UTF-16BE', $encoding, false);
  168. testValidString("\x00\xA5", "\x1B(J\x5C\x1B(B", 'UTF-16BE', $encoding, false);
  169. }
  170. echo "Other mappings from Unicode -> ISO-2022-JP are OK\n";
  171. // Test "long" illegal character markers
  172. mb_substitute_character("long");
  173. convertInvalidString("\xE0", "%", "JIS", "UTF-8");
  174. convertInvalidString("\xE0", "%", "ISO-2022-JP", "UTF-8");
  175. convertInvalidString("\x1B\$(X", "%\$(X", "JIS", "UTF-8"); // Invalid escape
  176. convertInvalidString("\x1B\$(X", "%\$(X", "ISO-2022-JP", "UTF-8"); // Invalid escape
  177. convertInvalidString("\x1B\$B!", "%", "JIS", "UTF-8"); // Truncated character
  178. convertInvalidString("\x1B\$B!", "%", "ISO-2022-JP", "UTF-8"); // Truncated character
  179. echo "Done!\n";
  180. ?>
  181. --EXPECT--
  182. ASCII support OK
  183. JIS X 0201 support OK
  184. JIS X 0208 support OK
  185. JIS X 0212 support OK
  186. All escape sequences work as expected
  187. Other mappings from Unicode -> ISO-2022-JP are OK
  188. Done!