sjis_mobile_encodings.phpt 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. --TEST--
  2. Exhaustive test of Shift-JIS DoCoMo, KDDI, SoftBank encoding verification and conversion
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  /* This test is exhaustive and slow; honour the SKIP_SLOW_TESTS opt-out */
  if (getenv("SKIP_SLOW_TESTS")) {
      die("skip slow test");
  }
  8. ?>
  9. --FILE--
  10. <?php
  /* Fixed seed, so that results are consistent from run to run */
  srand(818);
  include 'encoding_tests.inc';
  mb_substitute_character(0x25); // '%'

  /* Read in the table of all characters in Windows-932
   * (the SJIS-Mobile encodings all use the MS extensions) */
  readConversionTable(__DIR__ . '/data/CP932.txt', $sjisChars, $fromUnicode, true);

  /* Additional Unicode -> SJIS mappings, keyed by Unicode codepoint.
   * They are only added to $fromUnicode, not to $sjisChars. */
  $extraFromUnicode = [
      0x301C => "\x81\x60", // WAVE DASH            -> SJIS 0x8160 (WAVE DASH)
      0x2212 => "\x81\x7C", // MINUS SIGN           -> SJIS 0x817C (FULLWIDTH HYPHEN-MINUS)
      0x203E => "\x81\x50", // OVERLINE             -> SJIS 0x8150 (FULLWIDTH MACRON)
      0x2016 => "\x81\x61", // DOUBLE VERTICAL LINE -> SJIS 0x8161 (PARALLEL TO)
      0x00AF => "\x81\x50", // MACRON               -> SJIS 0x8150 (FULLWIDTH MACRON)
      0x00AC => "\x81\xCA", // NOT SIGN             -> SJIS 0x81CA (FULLWIDTH NOT SIGN)
      0x00A5 => "\x81\x8F", // YEN SIGN             -> SJIS 0x818F (FULLWIDTH YEN SIGN)
      0x00A3 => "\x81\x92", // POUND SIGN           -> SJIS 0x8192 (FULLWIDTH POUND SIGN)
      0x00A2 => "\x81\x91", // CENT SIGN            -> SJIS 0x8191 (FULLWIDTH CENT SIGN)
  ];
  foreach ($extraFromUnicode as $unicodeValue => $sjisBytes) {
      /* Keys of $fromUnicode are UTF-32BE strings */
      $fromUnicode[pack('N', $unicodeValue)] = $sjisBytes;
  }
  /* Besides the characters in that table, a 'user' area covering SJIS
   * 0xF040-0xF9FC is also supported. It maps, in sequence, onto the Unicode
   * 'private' codepoints 0xE000-0xE757 */
  $codepoint = 0xE000;
  foreach (range(0xF0, 0xF9) as $leadByte) {
      foreach (range(0x40, 0xFC) as $trailByte) {
          /* 0x7F is not used as a trail byte */
          if ($trailByte == 0x7F) {
              continue;
          }
          $userChar = chr($leadByte) . chr($trailByte);
          $privateUse = pack('N', $codepoint);
          $codepoint++;
          $sjisChars[$userChar] = $privateUse;
          $fromUnicode[$privateUse] = $userChar;
      }
  }
  /* Collect every codepoint in 0..0xFFFF (as a UTF-32BE string) which has no
   * entry in $fromUnicode */
  $invalidCodepoints = [];
  foreach (range(0, 0xFFFF) as $value) {
      $utf32be = pack('N', $value);
      if (isset($fromUnicode[$utf32be])) {
          continue;
      }
      $invalidCodepoints[$utf32be] = true;
  }
  /* Windows-932 has many cases where two different kuten codes map to the
   * same Unicode codepoint.
   *
   * Everything from 0xED00-0xEEFF falls in this unfortunate category
   * (other sequences in 0xFA00-0xFC4B map to the same codepoints).
   * Our implementation of CP932 prefers the F's, but for SJIS-Mobile,
   * we prefer the E's.
   *
   * So every character present in 0xFA00-0xFC4B is recorded as
   * 'non-invertible', and its codepoint is dropped from $fromUnicode */
  $nonInvertible = [];
  foreach (range(0xFA00, 0xFC4B) as $sjisCode) {
      $sjisBytes = pack('n', $sjisCode);
      if (!isset($sjisChars[$sjisBytes])) {
          continue;
      }
      $mapped = $sjisChars[$sjisBytes];
      $nonInvertible[$sjisBytes] = $mapped;
      unset($fromUnicode[$mapped]);
  }

  /* Other "collisions", handled the same way */
  $otherCollisions = [
      0x8790, 0x8791, 0x8792,
      0x8795, 0x8796, 0x8797,
      0x879A, 0x879B, 0x879C,
      0xEEF9,
  ];
  foreach ($otherCollisions as $sjisCode) {
      $sjisBytes = pack('n', $sjisCode);
      $mapped = $sjisChars[$sjisBytes];
      $nonInvertible[$sjisBytes] = $mapped;
      unset($fromUnicode[$mapped]);
  }

  /* SoftBank and DoCoMo each get their own copy, adjusted further below */
  $nonInvertibleSoftbank = $nonInvertible;
  $nonInvertibleDocomo = $nonInvertible;
  /* Now read the table of vendor-specific emoji encodings.
   * Each vendor's table starts out as a copy of the common table; SoftBank
   * emoji are gathered separately in $sbEmoji */
  $docomo = $sjisChars;
  $kddi = $sjisChars;
  $softbank = $sjisChars;
  $sbEmoji = array();

  /* The data file is only ever read, so open it read-only: 'r+' would demand
   * write permission on it for no reason */
  $emojiSourcesPath = __DIR__ . '/data/EmojiSources.txt';
  $fp = fopen($emojiSourcesPath, 'r');
  if ($fp === false)
      die("Could not open $emojiSourcesPath\n");

  /* Compare against false explicitly so that only EOF/error ends the loop */
  while (($line = fgets($fp, 256)) !== false) {
      /* Lines starting with '#' are comments */
      if ($line[0] == '#')
          continue;

      /* Semicolon-separated hex fields:
       * Unicode codepoint(s); DoCoMo bytes; KDDI bytes; SoftBank bytes */
      $fields = explode(';', rtrim($line));
      if (count($fields) >= 4) {
          if (sscanf($fields[0], "%x %x", $cp1, $cp2) == 2) {
              /* Emoji represented by a sequence of two codepoints */
              $utf32 = pack('N', $cp1) . pack('N', $cp2);
          } else {
              $utf32 = pack('N', hexdec($fields[0]));
              /* A single codepoint which has an emoji mapping is convertible */
              unset($invalidCodepoints[$utf32]);
          }

          /* A vendor's field is empty when it has no encoding for the emoji */
          if ($fields[1])
              $docomo[pack('n', hexdec($fields[1]))] = $utf32;
          if ($fields[2])
              $kddi[pack('n', hexdec($fields[2]))] = $utf32;
          if ($fields[3]) {
              $bytes = pack('n', hexdec($fields[3]));
              $sbEmoji[$bytes] = $utf32;
              unset($nonInvertibleSoftbank[$bytes]);
          }
      }
  }
  fclose($fp);
  /* Other, vendor-specific emoji which do not appear in EmojiSources.txt.
   * Most of these don't exist in Unicode and have been mapped to 'private
   * area' codepoints.
   * Keys are DoCoMo SJIS bytes, values are UTF-32BE */
  $docomoExtras = [
      "\xF9\x4A" => "\x00\x0F\xEE\x16", // PIAS PI
      "\xF9\x4B" => "\x00\x0F\xEE\x17", // PIAS A
      "\xF9\x4C" => "\x00\x0F\xEE\x18", // INVERSE TICKET
      "\xF9\x4D" => "\x00\x0F\xEE\x19", // KATAKANA ABBREVIATION FOR TICKET ("chi ke")
      "\xF9\x4E" => "\x00\x0F\xEE\x1A", // RESERVE BY PHONE
      "\xF9\x4F" => "\x00\x0F\xEE\x1B", // P CODE
      "\xF9\x53" => "\x00\x0F\xEE\x1C", // MOVIES 2
      "\xF9\x54" => "\x00\x0F\xEE\x1D", // PIAS PI INVERSE
      "\xF9\x58" => "\x00\x0F\xEE\x1E", // PIAS PI CIRCLE
      "\xF9\x59" => "\x00\x0F\xEE\x1F", // PIAS PI SQUARE
      "\xF9\x5A" => "\x00\x0F\xEE\x20", // CHECK
      "\xF9\x5F" => "\x00\x0F\xEE\x21", // F
      "\xF9\x60" => "\x00\x0F\xEE\x22", // D
      "\xF9\x61" => "\x00\x0F\xEE\x23", // S
      "\xF9\x62" => "\x00\x0F\xEE\x24", // C
      "\xF9\x63" => "\x00\x0F\xEE\x25", // R
      "\xF9\x64" => "\x00\x00\x25\xEA", // SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK
      "\xF9\x65" => "\x00\x00\x25\xA0", // BLACK SQUARE
      "\xF9\x66" => "\x00\x00\x25\xBF", // DOWNWARD TRIANGLE
      /* TODO: test that FEE28 converts to F966, for backwards compatibility */
      "\xF9\x67" => "\x00\x0F\xEE\x29", // QUADRUPLE DAGGER
      "\xF9\x68" => "\x00\x0F\xEE\x2A", // TRIPLE DAGGER
      "\xF9\x69" => "\x00\x0F\xEE\x2B", // DOUBLE DAGGER
      "\xF9\x6A" => "\x00\x00\x20\x20", // DAGGER
      /* TODO: test that FEE2C converts to F96A, for backwards compatibility */
      "\xF9\x6B" => "\x00\x0F\xEE\x2D", // I (meaning "inexpensive")
      "\xF9\x6C" => "\x00\x0F\xEE\x2E", // M (meaning "moderate")
      "\xF9\x6D" => "\x00\x0F\xEE\x2F", // E (meaning "expensive")
      "\xF9\x6E" => "\x00\x0F\xEE\x30", // VE (meaning "very expensive")
      "\xF9\x6F" => "\x00\x0F\xEE\x31", // SPHERE
      "\xF9\x70" => "\x00\x0F\xEE\x32", // CREDIT CARDS NOT ACCEPTED
      "\xF9\x71" => "\x00\x0F\xEE\x33", // CHECKBOX
      "\xF9\x75" => "\x00\x0F\xEE\x10", // I-MODE
      "\xF9\x76" => "\x00\x0F\xEE\x11", // I-MODE WITH FRAME
      "\xF9\x78" => "\x00\x0F\xEE\x12", // PROVIDED BY DOCOMO
      "\xF9\x79" => "\x00\x0F\xEE\x13", // DOCOMO POINT
      "\xF9\x84" => "\x00\x00\x27\xBF", // FREE DIAL; mapped to DOUBLE CURLY LOOP
      "\xF9\x86" => "\x00\x0F\xE8\x2D", // MOBILE Q
      "\xF9\xB1" => "\x00\x0F\xEE\x14", // I-APPLI
      "\xF9\xB2" => "\x00\x0F\xEE\x15", // I-APPLI WITH BORDER
  ];
  foreach ($docomoExtras as $docomoBytes => $docomoUtf32) {
      $docomo[$docomoBytes] = $docomoUtf32;
  }

  /* These four map to ordinary (non-private) codepoints, and are also entered
   * in DoCoMo's non-invertible list */
  foreach (["\xF9\x64", "\xF9\x65", "\xF9\x66", "\xF9\x6A"] as $docomoBytes) {
      $nonInvertibleDocomo[$docomoBytes] = $docomoExtras[$docomoBytes];
  }

  /* U+27BF (used for FREE DIAL above) is therefore not an invalid codepoint */
  unset($invalidCodepoints["\x00\x00\x27\xBF"]);
  /* KDDI-specific emoji which do not appear in EmojiSources.txt.
   * Keys are KDDI SJIS bytes, values are UTF-32BE */
  $kddiExtras = [
      "\xF7\x94" => "\x00\x0F\xEE\x40", // EZ WEB
      "\xF7\xCF" => "\x00\x0F\xEE\x41", // EZ PLUS
      "\xF3\x70" => "\x00\x0F\xEE\x42", // EZ NAVIGATION
      "\xF4\x78" => "\x00\x0F\xEE\x43", // EZ MOVIE
      "\xF4\x86" => "\x00\x0F\xEE\x44", // CMAIL
      "\xF4\x8E" => "\x00\x0F\xEE\x45", // JAVA (TM)
      "\xF4\x8F" => "\x00\x0F\xEE\x46", // BREW
      "\xF4\x90" => "\x00\x0F\xEE\x47", // EZ RING MUSIC
      "\xF4\x91" => "\x00\x0F\xEE\x48", // EZ NAVI
      "\xF4\x92" => "\x00\x0F\xEE\x49", // WIN
      "\xF4\x93" => "\x00\x0F\xEE\x4A", // PREMIUM SIGN
      "\xF7\x48" => "\x00\x0F\xE8\x2D", // MOBILE Q
      "\xF7\xA3" => "\x00\x0F\xE8\x3C", // PDC ("personal digital cellular")
      "\xF7\xD2" => "\x00\x0F\xEB\x89", // OPENWAVE
  ];
  foreach ($kddiExtras as $kddiBytes => $kddiUtf32) {
      $kddi[$kddiBytes] = $kddiUtf32;
  }
  /* SoftBank-specific emoji which do not appear in EmojiSources.txt.
   * Keys are SoftBank SJIS bytes, values are UTF-32BE */
  $softbankExtras = [
      "\xF7\xB1" => "\x00\x00\x27\xBF", // FREE DIAL; mapped to DOUBLE CURLY LOOP
      "\xF7\xF4" => "\x00\x0F\xEE\x77", // J-PHONE SHOP
      "\xF7\xF5" => "\x00\x0F\xEE\x78", // SKY WEB
      "\xF7\xF6" => "\x00\x0F\xEE\x79", // SKY WALKER
      "\xF7\xF7" => "\x00\x0F\xEE\x7A", // SKY MELODY
      "\xF7\xF8" => "\x00\x0F\xEE\x7B", // J-PHONE 1
      "\xF7\xF9" => "\x00\x0F\xEE\x7C", // J-PHONE 2
      "\xF7\xFA" => "\x00\x0F\xEE\x7D", // J-PHONE 3
  ];
  foreach ($softbankExtras as $softbankBytes => $softbankUtf32) {
      $sbEmoji[$softbankBytes] = $softbankUtf32;
  }

  /* SoftBank-specific 'JSky1', 'JSky2', 'VODAFONE1', 'VODAFONE2', etc. emoji,
   * which are not supported by Unicode: SJIS 0xFBD8-0xFBDE map, in order,
   * to codepoints 0xFEE70-0xFEE76 */
  foreach (range(0, 0xFBDE - 0xFBD8) as $offset) {
      $softbankBytes = pack('n', 0xFBD8 + $offset);
      $sbEmoji[$softbankBytes] = pack('N', 0xFEE70 + $offset);
      unset($nonInvertibleSoftbank[$softbankBytes]);
  }

  /* SoftBank-specific emoji for Shibuya department store */
  $sbEmoji["\xFB\xAA"] = "\x00\x0F\xE4\xC5";
  unset($nonInvertibleSoftbank["\xFB\xAA"]);
  /* Layer the SoftBank emoji on top of the common table */
  $softbank = array_merge($softbank, $sbEmoji);

  /* For Softbank, we support an alternative representation for emoji which
   * uses sequences starting with ESC. Apparently this was used in older
   * versions of Softbank's phones.
   * ESC could be followed by 6 different ASCII characters, each of which
   * represented a different ku code */
  $escCodeToKu = [
      'G' => 0x91,
      'E' => 0x8D,
      'F' => 0x8E,
      'O' => 0x92,
      'P' => 0x95,
      'Q' => 0x96,
  ];
  /* Highest ten code which is tried for each of those ESC characters */
  $escCodeMaxTen = [
      'G' => 0x7A,
      'E' => 0x7A,
      'F' => 0x7A,
      'O' => 0x6D,
      'P' => 0x6C,
      'Q' => 0x5E,
  ];
  /**
   * Build the two-byte Shift-JIS sequence for a ku/ten pair.
   *
   * Both arguments are offset by 0x21 (i.e. 0x21 denotes the first row/cell).
   * Two consecutive ku share one lead byte: lead bytes run upward from 0x81
   * for the first 31 pairs and from 0xE0 after that. For the first ku of a
   * pair the trail byte runs upward from 0x40, jumping over 0x7F; for the
   * second ku of a pair it runs upward from 0x9F.
   *
   * @param int $ku  row code, 0x21-based
   * @param int $ten cell code, 0x21-based
   * @return string the lead byte followed by the trail byte
   */
  function shiftJISEncode($ku, $ten) {
      $row = $ku - 0x21;
      $cell = $ten - 0x21;

      $rowPair = $row >> 1;
      $lead = chr($rowPair < 31 ? $rowPair + 0x81 : $rowPair - 31 + 0xE0);

      if ($row % 2 != 0) {
          /* Second row of the pair: upper half of the trail byte range */
          return $lead . chr($cell + 0x9F);
      }

      /* First row of the pair: 0x40-0x7E, then continue from 0x80 */
      $trail = $cell < 63 ? $cell + 0x40 : $cell - 63 + 0x80;
      return $lead . chr($trail);
  }
  /* For every SoftBank emoji reachable through a ku/ten pair, register the
   * equivalent "ESC $ <char> <ten>" sequence as well. These are entered in the
   * non-invertible list along with the main table. */
  foreach ($escCodeToKu as $escChar => $kuCode) {
      $lastTen = $escCodeMaxTen[$escChar];
      foreach (range(0x21, $lastTen) as $tenCode) {
          $twoByteForm = shiftJISEncode($kuCode, $tenCode);
          if (!isset($sbEmoji[$twoByteForm])) {
              continue;
          }
          $escForm = "\x1B\$" . $escChar . chr($tenCode);
          $emojiUtf32 = $softbank[$twoByteForm];
          $softbank[$escForm] = $emojiUtf32;
          $nonInvertibleSoftbank[$escForm] = $emojiUtf32;
      }
  }

  /* A bare ESC is not valid for Softbank, since it is used for escape sequences
   * which represent emoji */
  unset($softbank["\x1B"]);
  /**
   * Run all of the checks for one SJIS-Mobile variant and print a progress
   * line after each stage.
   *
   * @param array  $validChars    map from byte sequences in $encoding to UTF-32BE
   * @param array  $nonInvertible subset of $validChars which is checked separately
   *                              (passed to testAllValidChars with `false` as the
   *                              last argument)
   * @param string $encoding      name of the encoding under test
   */
  function testSJISVariant($validChars, $nonInvertible, $encoding) {
      global $fromUnicode, $invalidCodepoints, $escCodeToKu;

      /* Bytes 0xE0-0xFC and 0x81-0x9F are entered in the length table with 2 */
      $lenTable = array_fill_keys(range(0xE0, 0xFC), 2) + array_fill_keys(range(0x81, 0x9F), 2);
      findInvalidChars($validChars, $invalidChars, $truncated, $lenTable);

      /* Take the ESC sequence prefixes (and prefix + 0xF) out of the lists of
       * invalid and truncated sequences */
      foreach (array_keys($escCodeToKu) as $escChar) {
          $escPrefix = "\x1B\$" . $escChar;
          unset($invalidChars[$escPrefix . "\x0F"]);
          unset($truncated[$escPrefix]);
      }

      /* Non-invertible sequences are tested on their own, so pull them out of
       * $validChars; remember which of them are ESC sequences */
      $escapes = [];
      foreach ($nonInvertible as $bytes => $unicode) {
          unset($validChars[$bytes]);
          if (strncmp($bytes, "\x1B", 1) === 0) {
              $escapes[] = $bytes;
          }
      }

      /* 0xF is used to terminate a run of emoji encoded using ESC sequence
       * We couldn't do this earlier or `findInvalidChars` wouldn't have worked
       * as desired */
      foreach ($escapes as $escSequence) {
          $nonInvertible[$escSequence . "\x0F"] = $nonInvertible[$escSequence];
          unset($nonInvertible[$escSequence]);
      }

      testAllValidChars($validChars, $encoding, 'UTF-32BE');
      testAllValidChars($nonInvertible, $encoding, 'UTF-32BE', false);
      echo "{$encoding} verification and conversion works on all valid characters\n";

      testAllInvalidChars($invalidChars, $validChars, $encoding, 'UTF-32BE', "\x00\x00\x00%");
      testTruncatedChars($truncated, $encoding, 'UTF-32BE', "\x00\x00\x00%");
      echo "{$encoding} verification and conversion works on all invalid characters\n";

      convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-32BE', $encoding, '%');
      echo "Unicode -> {$encoding} conversion works on all invalid codepoints\n";

      // Test "long" illegal character markers
      mb_substitute_character("long");
      convertInvalidString("\x80", "%", $encoding, "UTF-8");
      convertInvalidString("\x81\x20", "%", $encoding, "UTF-8");
      convertInvalidString("\xEA\xA9", "%", $encoding, "UTF-8");
      // Put back the substitute character which the rest of the test uses
      mb_substitute_character(0x25); // '%'

      // Test Regional Indicator codepoint at end of string
      // The mobile SJIS variants all have special characters to represent certain national
      // flags, but in Unicode these are represented by a sequence of _two_ codepoints
      // So if only one of those two codepoints appears at the end of a string, it can't
      // be converted to SJIS and should be treated as an error
      convertInvalidString("\x00\x01\xF1\xE6", "%", "UTF-32BE", $encoding); // Regional Indicator A
  }
  /* Run the checks for each vendor in turn. KDDI has no list of its own and
   * uses the base non-invertible list. */
  $variants = [
      [$docomo, $nonInvertibleDocomo, 'SJIS-Mobile#DOCOMO'],
      [$kddi, $nonInvertible, 'SJIS-Mobile#KDDI'],
      [$softbank, $nonInvertibleSoftbank, 'SJIS-Mobile#SOFTBANK'],
  ];
  foreach ($variants as [$variantChars, $variantNonInvertible, $variantName]) {
      testSJISVariant($variantChars, $variantNonInvertible, $variantName);
  }
  271. ?>
  272. --EXPECT--
  273. SJIS-Mobile#DOCOMO verification and conversion works on all valid characters
  274. SJIS-Mobile#DOCOMO verification and conversion works on all invalid characters
  275. Unicode -> SJIS-Mobile#DOCOMO conversion works on all invalid codepoints
  276. SJIS-Mobile#KDDI verification and conversion works on all valid characters
  277. SJIS-Mobile#KDDI verification and conversion works on all invalid characters
  278. Unicode -> SJIS-Mobile#KDDI conversion works on all invalid codepoints
  279. SJIS-Mobile#SOFTBANK verification and conversion works on all valid characters
  280. SJIS-Mobile#SOFTBANK verification and conversion works on all invalid characters
  281. Unicode -> SJIS-Mobile#SOFTBANK conversion works on all invalid codepoints