sjis2004_encoding.phpt 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. --TEST--
  2. Exhaustive test of SJIS-2004 encoding verification and conversion
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  7. if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
  8. ?>
  9. --FILE--
  10. <?php
  11. srand(101); /* Make results consistent */
  12. include('encoding_tests.inc');
  13. mb_substitute_character(0x25); // '%'
  14. /* Read in the table of all characters in SJIS-2004 */
  15. $validChars = array(); /* SJIS-2004 string -> UTF-32BE string */
  16. $fromUnicode = array(); /* UTF-16BE -> SJIS-2004 */
  17. $fp = fopen(__DIR__ . '/data/SJIS-2004.txt', 'r+');
  18. while ($line = fgets($fp, 256)) {
  19. if ($line[0] == '#')
  20. continue;
  21. $codepoint2 = null;
  22. if (sscanf($line, "0x%x\tU+%x+%x", $bytes, $codepoint1, $codepoint2) >= 2) {
  23. $sjis = ($bytes < 256) ? chr($bytes) : pack('n', $bytes);
  24. if ($codepoint2) {
  25. $validChars[$sjis] = pack('NN', $codepoint1, $codepoint2);
  26. } else {
  27. /* Two input byte sequences can translate to either a 'halfwidth' or a
  28. * 'fullwidth' version of a character; our implementation of SJIS-2004
  29. * translates them to the fullwidth versions */
  30. if (preg_match('/Fullwidth: U\+([0-9A-F]+)/', $line, $match))
  31. $codepoint1 = hexdec($match[1]);
  32. $validChars[$sjis] = pack('N', $codepoint1);
  33. if ($codepoint1 <= 0xFFFF)
  34. $fromUnicode[pack('n', $codepoint1)] = $sjis;
  35. }
  36. }
  37. }
  38. $fromUnicode["\x00\x7E"] = "\x7E";
  39. $fromUnicode["\x00\x5C"] = "\x5C";
  40. testAllValidChars($validChars, 'SJIS-2004', 'UTF-32BE');
  41. echo "SJIS-2004 verification and conversion works for all valid characters\n";
  42. findInvalidChars($validChars, $invalidChars, $truncated,
  43. array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2));
  44. testAllInvalidChars($invalidChars, $validChars, 'SJIS-2004', 'UTF-32BE', "\x00\x00\x00%");
  45. testTruncatedChars($truncated, 'SJIS-2004', 'UTF-32BE', "\x00\x00\x00%");
  46. echo "SJIS-2004 verification and conversion rejects all invalid characters\n";
  47. testAllValidChars($fromUnicode, 'UTF-16BE', 'SJIS-2004', false);
  48. echo "Unicode -> SJIS-2004 conversion works on all valid characters\n";
  49. findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
  50. convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-2004', '%');
  51. echo "Unicode -> SJIS-2004 conversion works on all invalid characters\n";
  52. // Some pairs of Unicode codepoints are represented by a single character in SJIS-2004
  53. // Test the case where the first codepoint looks like it might be one of these pairs...
  54. // but the second one doesn't match
  55. convertValidString("\x30\x4B\x00A", "\x82\xA9A", 'UTF-16BE', 'SJIS-2004', false);
  56. // Test "long" illegal character markers
  57. mb_substitute_character("long");
  58. convertInvalidString("\x80", "%", "SJIS-2004", "UTF-8");
  59. convertInvalidString("\x81\x20", "%", "SJIS-2004", "UTF-8");
  60. convertInvalidString("\xFC\xF5", "%", "SJIS-2004", "UTF-8");
  61. echo "Done!\n";
  62. ?>
  63. --EXPECT--
  64. SJIS-2004 verification and conversion works for all valid characters
  65. SJIS-2004 verification and conversion rejects all invalid characters
  66. Unicode -> SJIS-2004 conversion works on all valid characters
  67. Unicode -> SJIS-2004 conversion works on all invalid characters
  68. Done!