utf8_mobile_encodings.phpt 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. --TEST--
  2. Exhaustive test of UTF-8 text encoding (DoCoMo, KDDI, SoftBank variants)
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  7. if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
  8. ?>
  9. --FILE--
  10. <?php
  // Fixed seed: every run draws the same rand() sequence, keeping results reproducible.
  srand(855);
  include 'encoding_tests.inc';
  // Use '%' (0x25) as the mbstring substitution character.
  mb_substitute_character(ord('%'));
  // Malformed UTF-8 inputs, each paired with the big-endian 4-byte-per-character
  // output expected for it; '%' marks each substituted position.
  $badUTF8 = [
    // Beyond the valid Unicode range (0 - 0x10FFFF)
    "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // 0x110000
    "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // 0x1C0000
    "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // 0x1FFFFF

    // Inside the range reserved for UTF-16 surrogate pairs
    "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // 0xD800
    "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // 0xDBFF
    "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // 0xDFFF

    // Input ends in the middle of a character
    "\xDF"         => "\x00\x00\x00%", // 2-byte lead, nothing follows
    "\xEF\xBF"     => "\x00\x00\x00%", // 3-byte lead, one byte missing
    "\xF0\xBF\xBF" => "\x00\x00\x00%", // 4-byte lead, one byte missing

    // Multi-byte character cut short by an ASCII byte
    "\xDFA"         => "\x00\x00\x00%\x00\x00\x00A",
    "\xEF\xBFA"     => "\x00\x00\x00%\x00\x00\x00A",
    "\xF0\xBFA"     => "\x00\x00\x00%\x00\x00\x00A",
    "\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",

    // Multi-byte character cut short by the start of another multi-byte character
    "\xDF\xDF\xBF"         => "\x00\x00\x00%\x00\x00\x07\xFF",
    "\xEF\xBF\xDF\xBF"     => "\x00\x00\x00%\x00\x00\x07\xFF",
    "\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",

    // Stray continuation bytes, not part of any multi-byte character
    "\x80"         => "\x00\x00\x00%",
    "A\x80"        => "\x00\x00\x00A\x00\x00\x00%",
    "\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%",

    // Overlong forms: more bytes used than the character needs
    "\xC1\xBF"         => str_repeat("\x00\x00\x00%", 2), // fits in 1 byte
    "\xE0\x9F\xBF"     => str_repeat("\x00\x00\x00%", 3), // fits in 2 bytes
    "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // fits in 3 bytes
  ];
  /**
   * Encode an integer as a big-endian byte string, using the fewest bytes
   * (1 to 4) that can hold it.
   *
   * @param int $value Number to encode
   * @return string 1-4 bytes, most significant byte first
   */
  function intToString($value) {
    // Check the widest range first and fall through to narrower ones.
    if ($value > 0xFFFFFF) {
      return pack('N', $value);
    }
    if ($value > 0xFFFF) {
      // Three bytes: the high byte by hand, the low 16 bits via pack()
      $highByte = chr($value >> 16);
      $lowWord = pack('n', $value & 0xFFFF);
      return $highByte . $lowWord;
    }
    if ($value > 0xFF) {
      return pack('n', $value);
    }
    return chr($value);
  }
  /**
   * Load a conversion table file into lookup arrays keyed by raw byte strings.
   *
   * Each line is matched against two patterns:
   *   "0x<codepoint>\t0x<char>"  - adds an entry to both $from and $to
   *   "0x<codepoint>\tBAD"       - adds the codepoint to $invalid
   * Lines matching neither pattern are ignored.
   *
   * Codepoints are stored as 4-byte big-endian strings (pack('N')); characters
   * are stored as the byte string produced by intToString().
   *
   * @param string $path     Table file to read
   * @param array  &$from    Receives character bytes => codepoint
   * @param array  &$to      Receives codepoint => character bytes
   * @param array  &$invalid Receives codepoint => true
   */
  function readUTF8ConversionTable($path, &$from, &$to, &$invalid) {
    $from = array();
    $to = array();
    $invalid = array();

    // The table is only ever read, so open it read-only; 'r+' also demands
    // write permission and fails on read-only data files.
    $fp = fopen($path, 'r');
    if ($fp === false)
      die("Failed to open conversion table: " . $path . "\n");

    // Compare with false explicitly: a falsy line (e.g. a final "0" without a
    // newline) must not be mistaken for end of file.
    while (($line = fgets($fp, 256)) !== false) {
      if (sscanf($line, "0x%x\t0x%x", $codepoint, $char) == 2) {
        $codepoint = pack('N', $codepoint);
        $char = intToString($char);
        $from[$char] = $codepoint;
        $to[$codepoint] = $char;
      } else if (sscanf($line, "0x%x\tBAD", $codepoint) == 1) {
        $codepoint = pack('N', $codepoint);
        $invalid[$codepoint] = true;
      }
    }

    fclose($fp);
  }
  /**
   * Run every check for one mobile UTF-8 variant, then print "<encoding> OK".
   *
   * Checks performed, in order:
   *   1. a random sample of codepoints without a table entry must encode the
   *      same as in standard UTF-8;
   *   2. every table mapping must convert in both directions;
   *   3. every codepoint the table marks invalid must convert to '%';
   *   4. every malformed sequence in the global $badUTF8 must yield its
   *      expected output.
   *
   * @param string $encoding mbstring encoding name under test
   * @param string $filename Conversion table path, appended directly to __DIR__
   *                         (so it must start with a slash)
   */
  function testUTF8Variant($encoding, $filename) {
    readUTF8ConversionTable(__DIR__ . $filename, $toUnicode, $fromUnicode, $invalidCodepoints);

    // Test some plain, vanilla codepoints (to/from mobile encoding)
    testValidString("\x00\x00", "\x00", "UTF-16BE", $encoding);
    for ($i = 0; $i < 1000; $i++) {
      $cp = pack('N', rand(1, 0x10FFFF));
      // Skip codepoints the table has an entry for; they are not expected to
      // match standard UTF-8. Mapped ones encode differently, and invalid ones
      // must become the substitution character. Both groups are checked
      // exhaustively further down.
      if (isset($fromUnicode[$cp]) || isset($invalidCodepoints[$cp]))
        continue;
      if (mb_convert_encoding($cp, $encoding, 'UTF-32BE') !== mb_convert_encoding($cp, 'UTF-8', 'UTF-32BE'))
        die("Expected U+" . bin2hex($cp) . " to be the same in UTF-8 and " . $encoding);
    }

    if ($encoding === 'UTF-8-Mobile#DOCOMO') {
      // In Docomo Shift-JIS, we have mappings for U+FEE16 up to U+FEE25 and
      // then U+FEE29-U+FEE2B, U+FEE2D-U+FEE33
      // These correspond to sequential Docomo SJIS codes, but in the middle there is
      // one emoji which converts to U+25EA (SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK)
      // However, when converting Unicode to Docomo vendor-specific encodings, we still
      // accept U+FEE26 and convert it to the same SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK emoji
      // So our mapping for U+FEE26 is not reversible
      // Encoded as UTF-8, that's EE9B80
      unset($toUnicode["\xEE\x9B\x80"]);
      // Similar for U+FEE27, U+FEE28, U+FEE2C
      unset($toUnicode["\xEE\x9B\x81"]);
      unset($toUnicode["\xEE\x9B\x82"]);
      unset($toUnicode["\xEE\x9B\x86"]);
    }

    // Test all characters which are different in mobile encoding (from standard UTF-8)
    foreach ($toUnicode as $char => $cp)
      testValidString($char, $cp, $encoding, 'UCS-4BE', false);
    foreach ($fromUnicode as $cp => $char)
      testValidString($cp, $char, 'UCS-4BE', $encoding, false);
    // Codepoints marked invalid in the table must come out as the substitution character
    foreach ($invalidCodepoints as $cp => $_)
      convertInvalidString($cp, '%', 'UCS-4BE', $encoding);

    // Try malformed UTF-8 sequences
    global $badUTF8;
    foreach ($badUTF8 as $invalidText => $expectedResult)
      testInvalidString($invalidText, $expectedResult, $encoding, 'UCS-4BE');

    echo "$encoding OK\n";
  }
  // Each mobile UTF-8 variant paired with its conversion table, run in this order.
  $variantTables = [
    'UTF-8-Mobile#DOCOMO'   => '/data/UTF-8-DOCOMO.txt',
    'UTF-8-Mobile#KDDI-A'   => '/data/UTF-8-KDDI-A.txt',
    'UTF-8-Mobile#KDDI-B'   => '/data/UTF-8-KDDI-B.txt',
    'UTF-8-Mobile#SOFTBANK' => '/data/UTF-8-SOFTBANK.txt',
  ];
  foreach ($variantTables as $variantName => $tablePath) {
    testUTF8Variant($variantName, $tablePath);
  }
  116. ?>
  117. --EXPECT--
  118. UTF-8-Mobile#DOCOMO OK
  119. UTF-8-Mobile#KDDI-A OK
  120. UTF-8-Mobile#KDDI-B OK
  121. UTF-8-Mobile#SOFTBANK OK