eucjp_2004_encoding.phpt 3.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. --TEST--
  2. Exhaustive test of EUC-JP-2004 encoding verification and conversion
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  // This test is exhaustive and therefore slow; let the runner opt out of it
  // by setting the SKIP_SLOW_TESTS environment variable to a truthy value.
  if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
  }
  8. ?>
  9. --FILE--
  10. <?php
  /*
   * Exhaustive round-trip test for the EUC-JP-2004 text encoding.
   *
   * Builds two lookup tables from the mapping file data/EUC-JP-2004.txt and
   * feeds them to the helper functions pulled in from encoding_tests.inc
   * (the helpers themselves are not visible here; see that file for their
   * exact contracts).
   */
  srand(200); /* Fixed seed so that results are consistent between runs */
  include('encoding_tests.inc');
  mb_substitute_character(0x25); // '%'

  $validChars = []; /* EUC-JP-2004 string -> UTF-32BE */
  $fromUnicode = []; /* UTF-16BE -> EUC-JP-2004 */

  /* The mapping table is only ever read, so open it read-only; 'r+' would
   * fail needlessly on a read-only checkout or filesystem. */
  $fp = fopen(__DIR__ . '/data/EUC-JP-2004.txt', 'r');
  if ($fp === false) {
    die("Could not open data/EUC-JP-2004.txt\n");
  }

  /* Compare against false explicitly so that only EOF/error ends the loop,
   * never a line which merely happens to be falsy (such as "0"). */
  while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] === '#') {
      continue; /* Comment line */
    }

    /* sscanf() only assigns the variables it actually matched, so reset the
     * optional second codepoint before parsing each line. */
    $codepoint2 = null;
    if (sscanf($line, "0x%x\tU+%x+%x", $bytes, $codepoint1, $codepoint2) >= 2) {
      /* Serialize the numeric byte sequence as a 1-, 2- or 3-byte string */
      if ($bytes < 256) {
        $eucjp = chr($bytes);
      } elseif ($bytes <= 0xFFFF) {
        $eucjp = pack('n', $bytes);
      } else {
        $eucjp = chr($bytes >> 16) . pack('n', $bytes & 0xFFFF);
      }

      if ($codepoint2 !== null) {
        /* Maps to a sequence of two codepoints; such entries are only
         * recorded for the EUC-JP-2004 -> Unicode direction. */
        $validChars[$eucjp] = pack('NN', $codepoint1, $codepoint2);
      } else {
        $validChars[$eucjp] = pack('N', $codepoint1);
        /* Only codepoints which fit in a single UTF-16 code unit are added
         * to the reverse (Unicode -> EUC-JP-2004) table. */
        if ($codepoint1 <= 0xFFFF) {
          $fromUnicode[pack('n', $codepoint1)] = $eucjp;
        }
      }
    }
  }
  fclose($fp);

  /* Convert 0xA1B1 to U+FFE3 (FULLWIDTH MACRON), not U+203E (OVERLINE) */
  $validChars["\xA1\xB1"] = "\x00\x00\xFF\xE3";
  $fromUnicode["\xFF\xE3"] = "\xA1\xB1";

  /* Convert 0xA1EF to U+FFE5 (FULLWIDTH YEN SIGN), not U+00A5 (YEN SIGN) */
  $validChars["\xA1\xEF"] = "\x00\x00\xFF\xE5";
  $fromUnicode["\xFF\xE5"] = "\xA1\xEF";

  /* Convert U+00A5 (YEN SIGN) to 0x5C; that is one of the single bytes
   * which many legacy Japanese text encodings used to represent something
   * different from its normal ASCII meaning. In ASCII it's a backslash,
   * but legacy Japanese software often used it for a yen sign. */
  $fromUnicode["\x00\xA5"] = "\x5C";
  /* The other one is 0x7E, which is a tilde in ASCII, but was used in
   * legacy Japanese software for an overline */
  $fromUnicode["\x20\x3E"] = "\x7E";

  /* EUC-JP-2004 -> Unicode: valid characters */
  testAllValidChars($validChars, 'EUC-JP-2004', 'UTF-32BE');
  echo "EUC-JP-2004 verification and conversion works for all valid characters\n";

  /* EUC-JP-2004 -> Unicode: invalid and truncated byte sequences; the last
   * argument is the substitution character '%' encoded as UTF-32BE */
  findInvalidChars($validChars, $invalidChars, $truncated);
  testAllInvalidChars($invalidChars, $validChars, 'EUC-JP-2004', 'UTF-32BE', "\x00\x00\x00%");
  testTruncatedChars($truncated, 'EUC-JP-2004', 'UTF-32BE', "\x00\x00\x00%");
  echo "EUC-JP-2004 verification and conversion rejects all invalid characters\n";

  /* Unicode -> EUC-JP-2004: valid characters */
  testAllValidChars($fromUnicode, 'UTF-16BE', 'EUC-JP-2004', false);
  echo "Unicode -> EUC-JP-2004 conversion works on all valid characters\n";

  /* Unicode -> EUC-JP-2004: everything not in the table. The fourth argument
   * maps every possible first byte to 2 -- presumably the expected sequence
   * length for UTF-16BE; confirm against encoding_tests.inc. */
  findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
  convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'EUC-JP-2004', '%');
  echo "Unicode -> EUC-JP-2004 conversion works on all invalid characters\n";

  // Test "long" illegal character markers
  mb_substitute_character("long");
  convertInvalidString("\x80", "%", "EUC-JP-2004", "UTF-8");
  convertInvalidString("\xFE\xFF", "%", "EUC-JP-2004", "UTF-8");

  echo "Done!\n";
  67. ?>
  68. --EXPECT--
  69. EUC-JP-2004 verification and conversion works for all valid characters
  70. EUC-JP-2004 verification and conversion rejects all invalid characters
  71. Unicode -> EUC-JP-2004 conversion works on all valid characters
  72. Unicode -> EUC-JP-2004 conversion works on all invalid characters
  73. Done!