cp932_encoding.phpt 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. --TEST--
  2. Exhaustive test of CP932 encoding verification and conversion
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  /* Skip this test when the SKIP_SLOW_TESTS environment variable is set */
  if (getenv("SKIP_SLOW_TESTS")) {
      die("skip slow test");
  }
  8. ?>
  9. --FILE--
  10. <?php
  /* Seed the random number generator with a fixed value so that results
   * are consistent from run to run */
  srand(4321);
  include 'encoding_tests.inc';

  /* 0x25 is '%'; use it as the substitution character */
  mb_substitute_character(0x25);

  /* Read in the table of all characters in CP932.
   * NOTE(review): $validChars and $fromUnicode are presumably filled in by
   * reference by this helper (it lives in encoding_tests.inc) -- confirm. */
  $tablePath = __DIR__ . '/data/CP932.txt';
  readConversionTable($tablePath, $validChars, $fromUnicode);
  /* Besides the characters listed in the conversion table, a 'user' area
   * spanning 0xF040-0xF9FC is supported. It maps, in order, onto the Unicode
   * 'private' codepoints 0xE000-0xE757. Trail byte 0x7F is not used. */
  $privateUse = 0xE000;
  foreach (range(0xF0, 0xF9) as $lead) {
      foreach (range(0x40, 0xFC) as $trail) {
          if ($trail != 0x7F) {
              $userChar = chr($lead) . chr($trail);
              /* Codepoint as 2 big-endian bytes; advance to the next one */
              $utf16be = pack('n', $privateUse++);
              $validChars[$userChar] = $utf16be;
              $fromUnicode[$utf16be] = $userChar;
          }
      }
  }
  /* Additional Unicode -> CP932 conversions which are accepted on top of the
   * conversion table. Keys are codepoints as 2 big-endian bytes; values are
   * the CP932 bytes which they are converted to. */
  $extraFromUnicode = [
      /* U+00A2 is CENT SIGN; it goes to FULLWIDTH CENT SIGN */
      "\x00\xA2" => "\x81\x91",
      /* U+00A3 is POUND SIGN; it goes to FULLWIDTH POUND SIGN */
      "\x00\xA3" => "\x81\x92",
      /* U+00A5 is YEN SIGN; it goes to FULLWIDTH YEN SIGN */
      "\x00\xA5" => "\x81\x8F",
      /* The JIS X 0208 FULLWIDTH TILDE is mapped to U+FF5E (FULLWIDTH TILDE),
       * but U+301C (WAVE DASH) is also accepted when converting to CP932 */
      "\x30\x1C" => "\x81\x60",
      /* The JIS X 0208 MINUS SIGN is mapped to U+FF0D (FULLWIDTH HYPHEN-MINUS),
       * but U+2212 (MINUS SIGN) is also accepted when converting to CP932 */
      "\x22\x12" => "\x81\x7C",
      /* The JIS X 0208 PARALLEL TO symbol is mapped to U+2225 (PARALLEL TO),
       * but U+2016 (DOUBLE VERTICAL LINE) is also accepted when converting
       * to CP932 */
      "\x20\x16" => "\x81\x61",
      /* The JIS X 0208 NOT SIGN is mapped to U+FFE2 (FULLWIDTH NOT SIGN),
       * but U+00AC (NOT SIGN) is also accepted when converting to CP932 */
      "\x00\xAC" => "\x81\xCA",
      /* U+203E is OVERLINE; it goes to JIS X 0208 FULLWIDTH MACRON */
      "\x20\x3E" => "\x81\x50",
      /* U+00AF is MACRON; it can go to FULLWIDTH MACRON as well */
      "\x00\xAF" => "\x81\x50",
  ];
  foreach ($extraFromUnicode as $utf16be => $cp932Bytes) {
      $fromUnicode[$utf16be] = $cp932Bytes;
  }
  /* Bytes 0x81-0x9F and 0xE0-0xFC are each associated with the value 2.
   * NOTE(review): presumably this tells findInvalidChars() that those bytes
   * start a 2-byte CP932 sequence -- confirm against encoding_tests.inc. */
  $cp932FirstBytes = array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2);
  findInvalidChars($validChars, $invalidChars, $truncated, $cp932FirstBytes);

  /* For the Unicode side, every possible first byte gets the value 2 */
  $unicodeFirstBytes = array_fill_keys(range(0, 0xFF), 2);
  findInvalidChars($fromUnicode, $invalidCodepoints, $unused, $unicodeFirstBytes);
  /* CP932 has 396 Unicode codepoints which are non-invertible: more than one
   * CP932 byte sequence maps to the same codepoint (some of them are 3-way
   * pile-ups). Such byte sequences are moved out of $validChars and into
   * $nonInvertible, so that they can be tested separately. */
  $nonInvertible = [];

  /* Everything in 0xED00-0xEEFF is in this category. Other sequences in
   * 0xFA00-0xFBFF map to the same codepoints, and when going from Unicode
   * back to CP932, the F's are favored over the E's. */
  foreach (range(0xED00, 0xEEFF) as $code) {
      $bytes = pack('n', $code);
      if (!isset($validChars[$bytes])) {
          continue; // not every value in this range is a valid character
      }
      unset($fromUnicode[$validChars[$bytes]]);
      $nonInvertible[$bytes] = $validChars[$bytes];
      unset($validChars[$bytes]);
  }

  /* 23 other collisions exist between 2-byte sequences which variously start
   * with 0x81, 0x87, or 0xFA. 0x81 is used whenever possible; 0x87 is the
   * second choice. The sequences listed here are the ones which lose out. */
  $collidingCodes = array_merge(
      range(0xFA4A, 0xFA53),
      [0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xFA54, 0xFA58, 0xFA59, 0xFA5A, 0xFA5B]
  );
  foreach ($collidingCodes as $code) {
      $bytes = pack('n', $code);
      unset($fromUnicode[$validChars[$bytes]]);
      $nonInvertible[$bytes] = $validChars[$bytes];
      unset($validChars[$bytes]);
  }
  /* Valid characters: the invertible ones first, then each non-invertible
   * byte sequence individually (with `false` as the final argument) */
  testAllValidChars($validChars, 'CP932', 'UTF-16BE');
  foreach ($nonInvertible as $sjisBytes => $utf16be) {
      testValidString($sjisBytes, $utf16be, 'CP932', 'UTF-16BE', false);
  }
  echo "CP932 verification and conversion works on all valid characters\n";

  /* Invalid CP932 byte sequences */
  testAllInvalidChars($invalidChars, $validChars, 'CP932', 'UTF-16BE', "\x00%");
  echo "CP932 verification and conversion works on all invalid characters\n";

  /* Unicode codepoints which cannot be converted to CP932 */
  convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
  echo "Unicode -> CP932 conversion works on all invalid codepoints\n";

  /* Test "long" illegal character markers */
  mb_substitute_character("long");
  foreach (["\x80", "\xEA", "\x81\x20", "\xEA\xA9"] as $invalidBytes) {
      convertInvalidString($invalidBytes, "%", "CP932", "UTF-8");
  }

  echo "Done!\n";
  101. ?>
  102. --EXPECT--
  103. CP932 verification and conversion works on all valid characters
  104. CP932 verification and conversion works on all invalid characters
  105. Unicode -> CP932 conversion works on all invalid codepoints
  106. Done!