cp51932_encoding.phpt 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
--TEST--
Exhaustive test of CP51932 encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
  6. <?php
  7. if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
  8. ?>
--FILE--
  10. <?php
  11. srand(2020); /* Make results consistent */
  12. include('encoding_tests.inc');
  13. mb_substitute_character(0x25); // '%'
  14. /* Read in the table of all characters in CP51932 */
  15. $validChars = array(); /* CP51932 string -> UTF-16BE string */
  16. $fromUnicode = array();
  17. $fp = fopen(realpath(__DIR__ . '/data/CP51932.txt'), 'r+');
  18. while ($line = fgets($fp, 256)) {
  19. if ($line[0] == '#')
  20. continue;
  21. $byte2 = null;
  22. if (sscanf($line, '<U%x> \x%x\x%x', $codepoint, $byte1, $byte2) >= 2) {
  23. /* The table we are using tries to map as many Unicode codepoints into
  24. * CP51932 as possible, including by mapping latin characters with accents
  25. * to the equivalent without accents; but since CP51932 is based on the
  26. * CP932 character set, we don't need to handle codepoints which are not
  27. * mapped from any character in CP932 */
  28. if (($codepoint >= 0xC0 && $codepoint <= 0xD6) ||
  29. ($codepoint >= 0xD8 && $codepoint <= 0xF6) ||
  30. ($codepoint >= 0xF8 && $codepoint <= 0xFF))
  31. continue;
  32. $cp51932 = ($byte2 ? (chr($byte1) . chr($byte2)) : chr($byte1));
  33. $utf16 = pack('n', $codepoint);
  34. $validChars[$cp51932] = $utf16;
  35. $fromUnicode[$utf16] = $cp51932;
  36. }
  37. }
  38. /* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE)
  39. * But when converting Unicode to CP51932, we also accept U+301C (WAVE DASH) */
  40. $fromUnicode["\x30\x1C"] = "\xA1\xC1";
  41. /* We map the JIS X 0208 MINUS SIGN to U+FF0D (FULLWIDTH HYPHEN-MINUS SIGN),
  42. * but when converting Unicode to CP51932, we also accept U+2212 (MINUS SIGN) */
  43. $fromUnicode["\x22\x12"] = "\xA1\xDD";
  44. /* We map the JIS X 0208 PARALLEL TO symbol to U+2225 (PARALLEL TO),
  45. * but when converting Unicode to CP51932, we also accept U+2016
  46. * (DOUBLE VERTICAL LINE) */
  47. $fromUnicode["\x20\x16"] = "\xA1\xC2";
  48. /* There are a number of duplicate, irreversible mappings in the CP51932 table
  49. * In most cases, the one which we primarily use appears last in the table,
  50. * but in some cases, it is first and will be overwritten in the above loop
  51. *
  52. * Interestingly, the "collisions" happen in both directions! Part of this is
  53. * because the table we are using attempts to map as many Unicode codepoints
  54. * as possible to CP932 characters */
  55. $fromUnicode["\x22\x20"] = "\xA2\xDC";
  56. $fromUnicode["\x22\x29"] = "\xA2\xC1";
  57. $fromUnicode["\x22\x2B"] = "\xA2\xE9";
  58. $fromUnicode["\x22\x35"] = "\xA2\xE8";
  59. $fromUnicode["\x22\x1A"] = "\xA2\xE5";
  60. $fromUnicode["\x22\x2A"] = "\xA2\xC0";
  61. $fromUnicode["\x22\x61"] = "\xA2\xE1";
  62. $fromUnicode["\x22\xA5"] = "\xA2\xDD";
  63. $fromUnicode["\x22\x52"] = "\xA2\xE2";
  64. $fromUnicode["\xFF\xE2"] = "\xA2\xCC";
  65. unset($fromUnicode["\x00\xA1"]); // Don't map upside-down ! to ordinary !
  66. unset($fromUnicode["\x00\xA6"]); // Don't map broken bar to ordinary pipe character
  67. unset($fromUnicode["\x00\xA9"]); // Don't map © to c
  68. unset($fromUnicode["\x00\xAA"]); // Don't map feminine ordinal indicator
  69. unset($fromUnicode["\x00\xAB"]); // Don't map left double angled quote mark to "much less than"
  70. unset($fromUnicode["\x00\xAD"]); // Don't map soft hyphen to ordinary hyphen
  71. unset($fromUnicode["\x00\xAE"]); // Don't map ® to R
  72. unset($fromUnicode["\x00\xAF"]); // Don't map Unicode halfwidth macron to CP932 fullwidth macron
  73. unset($fromUnicode["\x00\xB2"]); // Don't map ² to ordinary 2
  74. unset($fromUnicode["\x00\xB3"]); // Don't map ³ to ordinary 3
  75. unset($fromUnicode["\x00\xB5"]); // Don't map micro sign to Greek mu
  76. unset($fromUnicode["\x00\xB7"]); // Don't map middle dot to katakana middle dot
  77. unset($fromUnicode["\x00\xB8"]); // Don't map cedilla to fullwidth comma
  78. unset($fromUnicode["\x00\xB9"]); // Don't map ¹ to ordinary 1
  79. unset($fromUnicode["\x00\xBA"]); // Don't map "masculine ordinal indicator"
  80. unset($fromUnicode["\x00\xBB"]); // Don't map right double angled quote mark to "much greater than"
  81. unset($fromUnicode["\x30\x94"]); // Don't map hiragana vu to katakana vu
  82. for ($i = 0; $i <= 0x7F; $i++)
  83. $validChars[chr($i)] = "\x00" . chr($i);
  84. /* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */
  85. $fromUnicode["\x00\xA5"] = "\xA1\xEF";
  86. /* U+203E is OVERLINE; convert to FULLWIDTH MACRON */
  87. $fromUnicode["\x20\x3E"] = "\xA1\xB1";
  88. /* U+00AF is MACRON; convert to FULLWIDTH MACRON */
  89. $fromUnicode["\x00\xAF"] = "\xA1\xB1";
  90. testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);
  91. testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);
  92. echo "CP51932 verification and conversion works on all valid characters\n";
  93. findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA9, 0xAF), 2) + array_fill_keys(range(0xF5, 0xF8), 2) + array(0xFD => 2, 0xFE => 2));
  94. testAllInvalidChars($invalidChars, $validChars, 'CP51932', 'UTF-16BE', "\x00%");
  95. testTruncatedChars($truncated, 'CP51932', 'UTF-16BE', "\x00%");
  96. echo "CP51932 verification and conversion works on all invalid characters\n";
  97. findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
  98. convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');
  99. echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";
  100. // Test "long" illegal character markers
  101. mb_substitute_character("long");
  102. convertInvalidString("\x80", "%", "CP51932", "UTF-8");
  103. convertInvalidString("\xFE\xFF", "%", "CP51932", "UTF-8");
  104. echo "Done!\n";
  105. ?>
--EXPECT--
CP51932 verification and conversion works on all valid characters
CP51932 verification and conversion works on all invalid characters
Unicode -> CP51932 conversion works on all invalid codepoints
Done!