eucjp_encoding.phpt 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
--TEST--
Exhaustive test of EUC-JP encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
  <?php
  // This test walks whole character tables, so it is slow. Emitting a message
  // that starts with "skip" tells the phpt runner to skip it; do so whenever
  // the SKIP_SLOW_TESTS environment variable is set to a truthy value.
  if (getenv("SKIP_SLOW_TESTS")) {
      die("skip slow test");
  }
  ?>
--FILE--
  <?php
  /*
   * Exhaustive EUC-JP <-> UTF-32BE conversion test.
   *
   * Phases, in order (the order matters: the substitute character and the
   * lookup tables are mutated as the script goes):
   *   1. load the EUC-JP conversion table and patch in the ambiguous mappings
   *   2. EUC-JP -> UTF-32BE for valid, invalid and truncated input
   *   3. UTF-32BE -> EUC-JP for mapped and unmapped code points
   *   4. spot checks with the "long" substitute-character mode
   *
   * readConversionTable(), findInvalidChars(), testAllValidChars(),
   * testAllInvalidChars(), testTruncatedChars(), testValidString(),
   * convertAllInvalidChars() and convertInvalidString() are not defined in
   * this file; presumably they come from encoding_tests.inc.
   */

  srand(555); /* Fixed seed: make results consistent from run to run */
  include('encoding_tests.inc');
  mb_substitute_character(0x25); // '%' stands in for anything unconvertible

  /* Read in the table of all characters in EUC-JP.
   * As used below, $validChars maps EUC-JP byte strings to UTF-32BE strings
   * and $fromUnicode maps UTF-32BE strings to EUC-JP byte strings.
   * NOTE(review): the meaning of the trailing `true` flag is defined by the
   * helper and is not visible here. */
  readConversionTable(__DIR__ . '/data/EUC-JP.txt', $validChars, $fromUnicode, true);

  /* The JIS X 0208 character set does not have a single, straightforward
   * mapping to the Unicode character set. The overrides below pin down the
   * choices this test expects. */

  /* Kuten code 0x2140 (EUC-JP 0xA1C0) is a backslash; this can be mapped to
   * 0x005C for an ordinary backslash, or 0xFF3C for a _fullwidth_ one.
   * We go with fullwidth. */
  $validChars["\xA1\xC0"] = "\x00\x00\xFF\x3C";
  $fromUnicode["\x00\x00\xFF\x3C"] = "\xA1\xC0";

  /* Unicode has both halfwidth and fullwidth NOT SIGN; convert both of them
   * to JIS X 0208 NOT SIGN */
  $fromUnicode["\x00\x00\xFF\xE2"] = "\xA2\xCC";
  /* Likewise for fullwidth and halfwidth POUND SIGN */
  $fromUnicode["\x00\x00\xFF\xE1"] = "\xA1\xF2";
  /* Likewise for fullwidth and halfwidth CENT SIGN */
  $fromUnicode["\x00\x00\xFF\xE0"] = "\xA1\xF1";
  /* Convert Unicode FULLWIDTH TILDE to JIS X 0208 WAVE DASH */
  $fromUnicode["\x00\x00\xFF\x5E"] = "\xA1\xC1";
  /* Convert Unicode FULLWIDTH HYPHEN-MINUS to JIS X 0208 MINUS SIGN */
  $fromUnicode["\x00\x00\xFF\x0D"] = "\xA1\xDD";
  /* Convert Unicode PARALLEL TO to JIS X 0208 DOUBLE VERTICAL LINE */
  $fromUnicode["\x00\x00\x22\x25"] = "\xA1\xC2";

  /* Unicode 0x007E (tilde) can be represented in two different ways in EUC-JP.
   * When converting Unicode to EUC-JP, use the simpler representation. */
  $fromUnicode["\x00\x00\x00\x7E"] = "\x7E";
  /* Likewise with 0x005C */
  $fromUnicode["\x00\x00\x00\x5C"] = "\x5C";
  /* U+203E is OVERLINE; convert to FULLWIDTH MACRON */
  $fromUnicode["\x00\x00\x20\x3E"] = "\xA1\xB1";

  /* Derive the invalid and truncated byte sequences from the valid set.
   * The last argument is keyed by byte value: 0xA1..0xFE and 0x8E map to 2,
   * 0x8F maps to 3 -- presumably lead byte => total sequence length (verify
   * against findInvalidChars). The key order is kept exactly as it was. */
  findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA1, 0xFE), 2) + [0x8E => 2, 0x8F => 3]);

  /* In the JIS X 0212 character set, kuten code 0x2237 (EUC-JP 0x8FA2B7)
   * is an ordinary tilde character.
   * This mapping is not reversible, because ASCII 0x7E also represents
   * the same character, so it is removed from the round-trip table here
   * (after findInvalidChars has seen it) and checked on its own further down. */
  unset($validChars["\x8F\xA2\xB7"]);

  testAllValidChars($validChars, 'EUC-JP', 'UTF-32BE');
  echo "Encoding verification and conversion work for all valid characters\n";

  /* "\x00\x00\x00%" is the '%' substitute character encoded as UTF-32BE */
  testAllInvalidChars($invalidChars, $validChars, 'EUC-JP', 'UTF-32BE', "\x00\x00\x00%");
  testTruncatedChars($truncated, 'EUC-JP', 'UTF-32BE', "\x00\x00\x00%");
  echo "Encoding verification and conversion work for all invalid characters\n";

  /* One-way check of the tilde mapping removed from $validChars above.
   * NOTE(review): the trailing `false` presumably disables the reverse
   * (UTF-32BE -> EUC-JP) check -- confirm in the helper. */
  testValidString("\x8F\xA2\xB7", "\x00\x00\x00~", 'EUC-JP', 'UTF-32BE', false);
  echo "Irreversible mapping of 0x8FA2B7 follows JIS X 0212 correctly\n";

  testAllValidChars($fromUnicode, 'UTF-32BE', 'EUC-JP', false);
  echo "Unicode -> EUC-JP conversion works on all valid characters\n";

  /* Rebuild $invalidChars for the opposite direction: every code point in
   * 0x0000..0xFFFF, packed as a 32-bit big-endian integer (i.e. UTF-32BE),
   * that has no entry in $fromUnicode. */
  $invalidChars = [];
  for ($cp = 0; $cp <= 0xFFFF; $cp++) {
      $char = pack('N', $cp);
      if (!isset($fromUnicode[$char])) {
          $invalidChars[$char] = true;
      }
  }
  convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-32BE', 'EUC-JP', '%');
  echo "Unicode -> EUC-JP conversion works on all invalid characters\n";

  // Test "long" illegal character markers
  // NOTE(review): the expected result passed here is still "%" even though the
  // substitute mode is now "long"; how that is reconciled is up to
  // convertInvalidString -- confirm there before changing these expectations.
  mb_substitute_character("long");
  convertInvalidString("\x80", "%", "EUC-JP", "UTF-8");
  convertInvalidString("\xFE\xFF", "%", "EUC-JP", "UTF-8");

  echo "Done!\n";
  ?>
--EXPECT--
Encoding verification and conversion work for all valid characters
Encoding verification and conversion work for all invalid characters
Irreversible mapping of 0x8FA2B7 follows JIS X 0212 correctly
Unicode -> EUC-JP conversion works on all valid characters
Unicode -> EUC-JP conversion works on all invalid characters
Done!