sjis_encoding.phpt 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. --TEST--
  2. Exhaustive test of Shift-JIS encoding verification and conversion
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  7. if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
  8. ?>
  9. --FILE--
  10. <?php
  11. srand(999); /* Make results consistent */
  12. include('encoding_tests.inc');
  13. mb_substitute_character(0x25); // '%'
  14. /* Read in the table of all characters in Shift-JIS */
  15. readConversionTable(__DIR__ . '/data/SHIFTJIS.txt', $validChars, $fromUnicode);
  16. for ($i = 0; $i < 0x20; $i++) {
  17. $validChars[chr($i)] = "\x00" . chr($i);
  18. $fromUnicode["\x00" . chr($i)] = chr($i);
  19. }
  20. /* According to the relevant Japan Industrial Standards Committee standards,
  21. * SJIS 0x5C is a Yen sign, and 0x7E is an overline.
  22. *
  23. * However, this conflicts with the implementation of SJIS in various legacy
  24. * software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken
  25. * as equivalent to the same ASCII bytes.
  26. *
  27. * Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes
  28. * compatibly with Microsoft products. This was changed in PHP 8.1.0, in an
  29. * attempt to comply with the JISC specifications. However, after discussion
  30. * with various concerned Japanese developers, it seems that the historical
  31. * behavior was more useful in the majority of applications which process
  32. * SJIS-encoded text. */
  33. $validChars["\x5C"] = "\x00\x5C";
  34. $validChars["\x7E"] = "\x00\x7E";
  35. $fromUnicode["\x00\x5C"] = "\x5C";
  36. $fromUnicode["\x00\x7E"] = "\x7E";
  37. /* That means it does not make sense to convert U+203E (OVERLINE)
  38. * to 0x7E; convert it to JIS X 0208 FULLWIDTH MACRON instead */
  39. $fromUnicode["\x20\x3E"] = "\x81\x50";
  40. /* U+00AF is MACRON; convert that to FULLWIDTH MACRON as well */
  41. $fromUnicode["\x00\xAF"] = "\x81\x50";
  42. /* Since we are treating 0x5C as equivalent to U+005C, it does not
  43. * make sense to convert U+00A5 (YEN SIGN) to 0x5C
  44. * Convert it to JIS X 0208 FULLWIDTH YEN SIGN instead */
  45. $fromUnicode["\x00\xA5"] = "\x81\x8F";
  46. /* DEL character */
  47. $validChars["\x7F"] = "\x00\x7F";
  48. $fromUnicode["\x00\x7F"] = "\x7F";
  49. /* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */
  50. $validChars["\x81\x5F"] = "\xFF\x3C";
  51. $fromUnicode["\xFF\x3C"] = "\x81\x5F";
  52. /* Unicode has both halfwidth and fullwidth NOT SIGN; convert both of them
  53. * to JIS X 0208 NOT SIGN */
  54. $fromUnicode["\xFF\xE2"] = "\x81\xCA";
  55. /* Likewise for fullwidth and halfwidth POUND SIGN */
  56. $fromUnicode["\xFF\xE1"] = "\x81\x92";
  57. /* Likewise for fullwidth and halfwidth CENT SIGN */
  58. $fromUnicode["\xFF\xE0"] = "\x81\x91";
  59. /* Convert Unicode FULLWIDTH TILDE to JIS X 0208 WAVE DASH */
  60. $fromUnicode["\xFF\x5E"] = "\x81\x60";
  61. /* Convert Unicode FULLWIDTH HYPHEN-MINUS to JIS X 0208 MINUS SIGN */
  62. $fromUnicode["\xFF\x0D"] = "\x81\x7C";
  63. /* Convert Unicode PARALLEL TO to JIS X 0208 DOUBLE VERTICAL LINE */
  64. $fromUnicode["\x22\x25"] = "\x81\x61";
  65. testAllValidChars($validChars, 'Shift-JIS', 'UTF-16BE');
  66. echo "SJIS verification and conversion works on all valid characters\n";
  67. findInvalidChars($validChars, $invalidChars, $truncated,
  68. array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xEF), 2));
  69. testAllInvalidChars($invalidChars, $validChars, 'Shift-JIS', 'UTF-16BE', "\x00%");
  70. testTruncatedChars($truncated, 'Shift-JIS', 'UTF-16BE', "\x00%");
  71. echo "SJIS verification and conversion works on all invalid characters\n";
  72. testAllValidChars($fromUnicode, 'UTF-16BE', 'Shift-JIS', false);
  73. echo "Unicode -> SJIS conversion works on all valid characters\n";
  74. findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
  75. convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'Shift-JIS', '%');
  76. echo "Unicode -> SJIS conversion works on all invalid characters\n";
  77. testValidString("\xFF\x5E", "\x81\x60", 'UTF-16BE', 'SJIS', false);
  78. echo "Other mappings from Unicode -> SJIS are OK\n";
  79. // Test "long" illegal character markers
  80. mb_substitute_character("long");
  81. convertInvalidString("\x80", "%", "Shift-JIS", "UTF-8");
  82. convertInvalidString("\x81\x20", "%", "Shift-JIS", "UTF-8");
  83. convertInvalidString("\xEA\xA9", "%", "Shift-JIS", "UTF-8");
  84. echo "Done!\n";
  85. ?>
  86. --EXPECT--
  87. SJIS verification and conversion works on all valid characters
  88. SJIS verification and conversion works on all invalid characters
  89. Unicode -> SJIS conversion works on all valid characters
  90. Unicode -> SJIS conversion works on all invalid characters
  91. Other mappings from Unicode -> SJIS are OK
  92. Done!