sjismac_encoding.phpt 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. --TEST--
  2. Exhaustive test of MacJapanese encoding verification and conversion
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  7. if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
  8. ?>
  9. --FILE--
  10. <?php
  11. srand(300); /* Make results consistent */
  12. include('encoding_tests.inc');
  13. mb_substitute_character(0x25); // '%'
  14. /* Read in the table of all characters in MacJapanese */
  15. $validChars = array(); /* MacJapanese string -> UTF-32BE string */
  16. $fromUnicode = array(); /* UTF-16BE -> MacJapanese */
  17. $fp = fopen(__DIR__ . '/data/MacJapanese-SJIS.txt', 'r+');
  18. while ($line = fgets($fp, 256)) {
  19. if ($line[0] == '#')
  20. continue;
  21. $cp1 = $cp2 = $cp3 = $cp4 = $cp5 = null;
  22. if (sscanf($line, "0x%x\t0x%x+0x%x+0x%x+0x%x+0x%x", $bytes, $cp1, $cp2, $cp3, $cp4, $cp5) >= 2) {
  23. if ($bytes < 256) {
  24. $macJap = chr($bytes);
  25. } else {
  26. $macJap = pack('n', $bytes);
  27. }
  28. if ($cp5) {
  29. $validChars[$macJap] = pack('NNNNN', $cp1, $cp2, $cp3, $cp4, $cp5);
  30. $fromUnicode[pack('nnnnn', $cp1, $cp2, $cp3, $cp4, $cp5)] = $macJap;
  31. } else if ($cp4) {
  32. $validChars[$macJap] = pack('NNNN', $cp1, $cp2, $cp3, $cp4);
  33. $fromUnicode[pack('nnnn', $cp1, $cp2, $cp3, $cp4)] = $macJap;
  34. } else if ($cp3) {
  35. $validChars[$macJap] = pack('NNN', $cp1, $cp2, $cp3);
  36. $fromUnicode[pack('nnn', $cp1, $cp2, $cp3)] = $macJap;
  37. } else if ($cp2) {
  38. $validChars[$macJap] = pack('NN', $cp1, $cp2);
  39. $fromUnicode[pack('nn', $cp1, $cp2)] = $macJap;
  40. } else {
  41. $validChars[$macJap] = pack('N', $cp1);
  42. $fromUnicode[pack('n', $cp1)] = $macJap;
  43. }
  44. }
  45. }
  46. /* Although not included in the table, 0x0-0x1F and 0x7F are valid;
  47. * these are 'control characters' */
  48. for ($i = 0; $i < 0x20; $i++) {
  49. $validChars[chr($i)] = pack('N', $i);
  50. $fromUnicode[pack('n', $i)] = chr($i);
  51. }
  52. $validChars["\x7F"] = pack('N', 0x7F);
  53. $fromUnicode["\x00\x7F"] = "\x7F";
  54. /* While Shift-JIS 0x815C normally corresponds to U+2015 (HORIZONTAL BAR),
  55. * for MacJapanese we convert 0x815C to U+2014 (EM DASH)
  56. * (See recommendations in JAPANESE.txt from the Unicode Consortium, under
  57. * 'Unicode mapping issues', point 3)
  58. * However, when converting Unicode -> MacJapanese, we accept both U+2014
  59. * and U+2015 */
  60. $fromUnicode["\x20\x15"] = "\x81\x5C";
  61. /* Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) */
  62. $fromUnicode["\x20\x3E"] = "\x81\x50";
  63. /* And also U+00AF (MACRON) */
  64. $fromUnicode["\x00\xAF"] = "\x81\x50";
  65. /* Convert U+FF5E (FULLWIDTH TILDE) to 0x8160 (WAVE DASH) */
  66. $fromUnicode["\xFF\x5E"] = "\x81\x60";
  67. testAllValidChars($validChars, 'SJIS-mac', 'UTF-32BE');
  68. echo "MacJapanese verification and conversion works on all valid characters\n";
  69. findInvalidChars($validChars, $invalidChars, $truncated,
  70. array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xED), 2));
  71. testAllInvalidChars($invalidChars, $validChars, 'SJIS-mac', 'UTF-32BE', "\x00\x00\x00%");
  72. testTruncatedChars($truncated, 'SJIS-mac', 'UTF-32BE', "\x00\x00\x00%");
  73. echo "MacJapanese verification and conversion rejects all invalid characters\n";
  74. testAllValidChars($fromUnicode, 'UTF-16BE', 'SJIS-mac', false);
  75. echo "Unicode -> SJIS-mac conversion works on all valid characters\n";
  76. findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
  77. convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-mac', '%');
  78. echo "Unicode -> SJIS-mac conversion works on all invalid characters\n";
  79. // Test "long" illegal character markers
  80. mb_substitute_character("long");
  81. convertInvalidString("\x81", "%", "SJIS-mac", "UTF-8");
  82. convertInvalidString("\x81\x20", "%", "SJIS-mac", "UTF-8");
  83. convertInvalidString("\xED\x9F", "%", "SJIS-mac", "UTF-8");
  84. echo "Done!\n";
  85. ?>
  86. --EXPECT--
  87. MacJapanese verification and conversion works on all valid characters
  88. MacJapanese verification and conversion rejects all invalid characters
  89. Unicode -> SJIS-mac conversion works on all valid characters
  90. Unicode -> SJIS-mac conversion works on all invalid characters
  91. Done!