--TEST--
Exhaustive test of verification and conversion of GB18030 text
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
// Exhaustive test: allow it to be skipped when SKIP_SLOW_TESTS is set in the environment.
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
?>
--FILE--
<?php
include('encoding_tests.inc');

srand(1111); // Make results consistent
mb_substitute_character(0x25); // '%'

// Load the 1- and 2-byte mapping table.
// NOTE(review): readConversionTable() is not defined in this file (presumably it comes from
// encoding_tests.inc) and appears to fill $toUnicode / $fromUnicode by reference -- confirm.
readConversionTable(__DIR__ . '/data/GB18030-2byte.txt', $toUnicode, $fromUnicode);
/* GB18030 represents all Unicode codepoints in the BMP which are _not_ covered by any
 * 2-byte GB18030 codepoint as a 4-byte code, with each of the 4 bytes in the following ranges:
 *
 * - 1st byte: 0x81-0x84
 * - 2nd byte: 0x30-0x39
 * - 3rd byte: 0x81-0xFE
 * - 4th byte: 0x30-0x39
 *
 * These start from 0x81308130 and count upwards one by one, with all the Unicode codepoints
 * which need to be represented as a 4-byte code appearing in sequence.
 *
 * Each subarray here is: [starting GB18030 codepoint bytes (4 of them), Unicode codepoint which it
 * converts to, number of sequential Unicode codepoints represented by sequential GB18030 codepoints] */
$gb18030_BMP_Mappings = [
    [0x81, 0x30, 0x81, 0x30, 0x80, 36],
    [0x81, 0x30, 0x84, 0x36, 0xa5, 2],
    [0x81, 0x30, 0x84, 0x38, 0xa9, 7],
    [0x81, 0x30, 0x85, 0x35, 0xb2, 5],
    [0x81, 0x30, 0x86, 0x30, 0xb8, 31],
    [0x81, 0x30, 0x89, 0x31, 0xd8, 8],
    [0x81, 0x30, 0x89, 0x39, 0xe2, 6],
    [0x81, 0x30, 0x8a, 0x35, 0xeb, 1],
    [0x81, 0x30, 0x8a, 0x36, 0xee, 4],
    [0x81, 0x30, 0x8b, 0x30, 0xf4, 3],
    [0x81, 0x30, 0x8b, 0x33, 0xf8, 1],
    [0x81, 0x30, 0x8b, 0x34, 0xfb, 1],
    [0x81, 0x30, 0x8b, 0x35, 0xfd, 4],
    [0x81, 0x30, 0x8b, 0x39, 0x102, 17],
    [0x81, 0x30, 0x8d, 0x36, 0x114, 7],
    [0x81, 0x30, 0x8e, 0x33, 0x11c, 15],
    [0x81, 0x30, 0x8f, 0x38, 0x12c, 24],
    [0x81, 0x30, 0x92, 0x32, 0x145, 3],
    [0x81, 0x30, 0x92, 0x35, 0x149, 4],
    [0x81, 0x30, 0x92, 0x39, 0x14e, 29],
    [0x81, 0x30, 0x95, 0x38, 0x16c, 98],
    [0x81, 0x30, 0x9f, 0x36, 0x1cf, 1],
    [0x81, 0x30, 0x9f, 0x37, 0x1d1, 1],
    [0x81, 0x30, 0x9f, 0x38, 0x1d3, 1],
    [0x81, 0x30, 0x9f, 0x39, 0x1d5, 1],
    [0x81, 0x30, 0xa0, 0x30, 0x1d7, 1],
    [0x81, 0x30, 0xa0, 0x31, 0x1d9, 1],
    [0x81, 0x30, 0xa0, 0x32, 0x1db, 1],
    [0x81, 0x30, 0xa0, 0x33, 0x1dd, 28],
    [0x81, 0x30, 0xa3, 0x31, 0x1fa, 87],
    [0x81, 0x30, 0xab, 0x38, 0x252, 15],
    [0x81, 0x30, 0xad, 0x33, 0x262, 101],
    [0x81, 0x30, 0xb7, 0x34, 0x2c8, 1],
    [0x81, 0x30, 0xb7, 0x35, 0x2cc, 13],
    [0x81, 0x30, 0xb8, 0x38, 0x2da, 183],
    [0x81, 0x30, 0xcb, 0x31, 0x3a2, 1],
    [0x81, 0x30, 0xcb, 0x32, 0x3aa, 7],
    [0x81, 0x30, 0xcb, 0x39, 0x3c2, 1],
    [0x81, 0x30, 0xcc, 0x30, 0x3ca, 55],
    [0x81, 0x30, 0xd1, 0x35, 0x402, 14],
    [0x81, 0x30, 0xd2, 0x39, 0x450, 1],
    [0x81, 0x30, 0xd3, 0x30, 0x452, 7102],
    [0x81, 0x36, 0xa5, 0x32, 0x2011, 2],
    [0x81, 0x36, 0xa5, 0x34, 0x2017, 1],
    [0x81, 0x36, 0xa5, 0x35, 0x201a, 2],
    [0x81, 0x36, 0xa5, 0x37, 0x201e, 7],
    [0x81, 0x36, 0xa6, 0x34, 0x2027, 9],
    [0x81, 0x36, 0xa7, 0x33, 0x2031, 1],
    [0x81, 0x36, 0xa7, 0x34, 0x2034, 1],
    [0x81, 0x36, 0xa7, 0x35, 0x2036, 5],
    [0x81, 0x36, 0xa8, 0x30, 0x203c, 112],
    [0x81, 0x36, 0xb3, 0x32, 0x20ad, 86],
    [0x81, 0x36, 0xbb, 0x38, 0x2104, 1],
    [0x81, 0x36, 0xbb, 0x39, 0x2106, 3],
    [0x81, 0x36, 0xbc, 0x32, 0x210a, 12],
    [0x81, 0x36, 0xbd, 0x34, 0x2117, 10],
    [0x81, 0x36, 0xbe, 0x34, 0x2122, 62],
    [0x81, 0x36, 0xc4, 0x36, 0x216c, 4],
    [0x81, 0x36, 0xc5, 0x30, 0x217a, 22],
    [0x81, 0x36, 0xc7, 0x32, 0x2194, 2],
    [0x81, 0x36, 0xc7, 0x34, 0x219a, 110],
    [0x81, 0x36, 0xd2, 0x34, 0x2209, 6],
    [0x81, 0x36, 0xd3, 0x30, 0x2210, 1],
    [0x81, 0x36, 0xd3, 0x31, 0x2212, 3],
    [0x81, 0x36, 0xd3, 0x34, 0x2216, 4],
    [0x81, 0x36, 0xd3, 0x38, 0x221b, 2],
    [0x81, 0x36, 0xd4, 0x30, 0x2221, 2],
    [0x81, 0x36, 0xd4, 0x32, 0x2224, 1],
    [0x81, 0x36, 0xd4, 0x33, 0x2226, 1],
    [0x81, 0x36, 0xd4, 0x34, 0x222c, 2],
    [0x81, 0x36, 0xd4, 0x36, 0x222f, 5],
    [0x81, 0x36, 0xd5, 0x31, 0x2238, 5],
    [0x81, 0x36, 0xd5, 0x36, 0x223e, 10],
    [0x81, 0x36, 0xd6, 0x36, 0x2249, 3],
    [0x81, 0x36, 0xd6, 0x39, 0x224d, 5],
    [0x81, 0x36, 0xd7, 0x34, 0x2253, 13],
    [0x81, 0x36, 0xd8, 0x37, 0x2262, 2],
    [0x81, 0x36, 0xd8, 0x39, 0x2268, 6],
    [0x81, 0x36, 0xd9, 0x35, 0x2270, 37],
    [0x81, 0x36, 0xdd, 0x32, 0x2296, 3],
    [0x81, 0x36, 0xdd, 0x35, 0x229a, 11],
    [0x81, 0x36, 0xde, 0x36, 0x22a6, 25],
    [0x81, 0x36, 0xe1, 0x31, 0x22c0, 82],
    [0x81, 0x36, 0xe9, 0x33, 0x2313, 333],
    [0x81, 0x37, 0x8c, 0x36, 0x246a, 10],
    [0x81, 0x37, 0x8d, 0x36, 0x249c, 100],
    [0x81, 0x37, 0x97, 0x36, 0x254c, 4],
    [0x81, 0x37, 0x98, 0x30, 0x2574, 13],
    [0x81, 0x37, 0x99, 0x33, 0x2590, 3],
    [0x81, 0x37, 0x99, 0x36, 0x2596, 10],
    [0x81, 0x37, 0x9a, 0x36, 0x25a2, 16],
    [0x81, 0x37, 0x9c, 0x32, 0x25b4, 8],
    [0x81, 0x37, 0x9d, 0x30, 0x25be, 8],
    [0x81, 0x37, 0x9d, 0x38, 0x25c8, 3],
    [0x81, 0x37, 0x9e, 0x31, 0x25cc, 2],
    [0x81, 0x37, 0x9e, 0x33, 0x25d0, 18],
    [0x81, 0x37, 0xa0, 0x31, 0x25e6, 31],
    [0x81, 0x37, 0xa3, 0x32, 0x2607, 2],
    [0x81, 0x37, 0xa3, 0x34, 0x260a, 54],
    [0x81, 0x37, 0xa8, 0x38, 0x2641, 1],
    [0x81, 0x37, 0xa8, 0x39, 0x2643, 2110],
    [0x81, 0x38, 0xfd, 0x39, 0x2e82, 2],
    [0x81, 0x38, 0xfe, 0x31, 0x2e85, 3],
    [0x81, 0x38, 0xfe, 0x34, 0x2e89, 2],
    [0x81, 0x38, 0xfe, 0x36, 0x2e8d, 10],
    [0x81, 0x39, 0x81, 0x36, 0x2e98, 15],
    [0x81, 0x39, 0x83, 0x31, 0x2ea8, 2],
    [0x81, 0x39, 0x83, 0x33, 0x2eab, 3],
    [0x81, 0x39, 0x83, 0x36, 0x2eaf, 4],
    [0x81, 0x39, 0x84, 0x30, 0x2eb4, 2],
    [0x81, 0x39, 0x84, 0x32, 0x2eb8, 3],
    [0x81, 0x39, 0x84, 0x35, 0x2ebc, 14],
    [0x81, 0x39, 0x85, 0x39, 0x2ecb, 293],
    [0x81, 0x39, 0xa3, 0x32, 0x2ffc, 4],
    [0x81, 0x39, 0xa3, 0x36, 0x3004, 1],
    [0x81, 0x39, 0xa3, 0x37, 0x3018, 5],
    [0x81, 0x39, 0xa4, 0x32, 0x301f, 2],
    [0x81, 0x39, 0xa4, 0x34, 0x302a, 20],
    [0x81, 0x39, 0xa6, 0x34, 0x303f, 2],
    [0x81, 0x39, 0xa6, 0x36, 0x3094, 7],
    [0x81, 0x39, 0xa7, 0x33, 0x309f, 2],
    [0x81, 0x39, 0xa7, 0x35, 0x30f7, 5],
    [0x81, 0x39, 0xa8, 0x30, 0x30ff, 6],
    [0x81, 0x39, 0xa8, 0x36, 0x312a, 246],
    [0x81, 0x39, 0xc1, 0x32, 0x322a, 7],
    [0x81, 0x39, 0xc1, 0x39, 0x3232, 113],
    [0x81, 0x39, 0xcd, 0x32, 0x32a4, 234],
    [0x81, 0x39, 0xe4, 0x36, 0x3390, 12],
    [0x81, 0x39, 0xe5, 0x38, 0x339f, 2],
    [0x81, 0x39, 0xe6, 0x30, 0x33a2, 34],
    [0x81, 0x39, 0xe9, 0x34, 0x33c5, 9],
    [0x81, 0x39, 0xea, 0x33, 0x33cf, 2],
    [0x81, 0x39, 0xea, 0x35, 0x33d3, 2],
    [0x81, 0x39, 0xea, 0x37, 0x33d6, 113],
    [0x81, 0x39, 0xf6, 0x30, 0x3448, 43],
    [0x81, 0x39, 0xfa, 0x33, 0x3474, 298],
    [0x82, 0x30, 0x9a, 0x31, 0x359f, 111],
    [0x82, 0x30, 0xa5, 0x32, 0x360f, 11],
    [0x82, 0x30, 0xa6, 0x33, 0x361b, 765],
    [0x82, 0x30, 0xf2, 0x38, 0x3919, 85],
    [0x82, 0x30, 0xfb, 0x33, 0x396f, 96],
    [0x82, 0x31, 0x86, 0x39, 0x39d1, 14],
    [0x82, 0x31, 0x88, 0x33, 0x39e0, 147],
    [0x82, 0x31, 0x97, 0x30, 0x3a74, 218],
    [0x82, 0x31, 0xac, 0x38, 0x3b4f, 287],
    [0x82, 0x31, 0xc9, 0x35, 0x3c6f, 113],
    [0x82, 0x31, 0xd4, 0x38, 0x3ce1, 885],
    [0x82, 0x32, 0xaf, 0x33, 0x4057, 264],
    [0x82, 0x32, 0xc9, 0x37, 0x4160, 471],
    [0x82, 0x32, 0xf8, 0x38, 0x4338, 116],
    [0x82, 0x33, 0x86, 0x34, 0x43ad, 4],
    [0x82, 0x33, 0x86, 0x38, 0x43b2, 43],
    [0x82, 0x33, 0x8b, 0x31, 0x43de, 248],
    [0x82, 0x33, 0xa3, 0x39, 0x44d7, 373],
    [0x82, 0x33, 0xc9, 0x32, 0x464d, 20],
    [0x82, 0x33, 0xcb, 0x32, 0x4662, 193],
    [0x82, 0x33, 0xde, 0x35, 0x4724, 5],
    [0x82, 0x33, 0xdf, 0x30, 0x472a, 82],
    [0x82, 0x33, 0xe7, 0x32, 0x477d, 16],
    [0x82, 0x33, 0xe8, 0x38, 0x478e, 441],
    [0x82, 0x34, 0x96, 0x39, 0x4948, 50],
    [0x82, 0x34, 0x9b, 0x39, 0x497b, 2],
    [0x82, 0x34, 0x9c, 0x31, 0x497e, 4],
    [0x82, 0x34, 0x9c, 0x35, 0x4984, 1],
    [0x82, 0x34, 0x9c, 0x36, 0x4987, 20],
    [0x82, 0x34, 0x9e, 0x36, 0x499c, 3],
    [0x82, 0x34, 0x9e, 0x39, 0x49a0, 22],
    [0x82, 0x34, 0xa1, 0x31, 0x49b8, 703],
    [0x82, 0x34, 0xe7, 0x34, 0x4c78, 39],
    [0x82, 0x34, 0xeb, 0x33, 0x4ca4, 111],
    [0x82, 0x34, 0xf6, 0x34, 0x4d1a, 148],
    [0x82, 0x35, 0x87, 0x32, 0x4daf, 81],
    [0x82, 0x35, 0x8f, 0x33, 0x9fa6, 14426],
    [0x83, 0x36, 0xc7, 0x39, 0xe76c, 1],
    [0x83, 0x36, 0xc8, 0x30, 0xe7c8, 1],
    [0x83, 0x36, 0xc8, 0x31, 0xe7e7, 13],
    [0x83, 0x36, 0xc9, 0x34, 0xe815, 1],
    [0x83, 0x36, 0xc9, 0x35, 0xe819, 5],
    [0x83, 0x36, 0xca, 0x30, 0xe81f, 7],
    [0x83, 0x36, 0xca, 0x37, 0xe827, 4],
    [0x83, 0x36, 0xcb, 0x31, 0xe82d, 4],
    [0x83, 0x36, 0xcb, 0x35, 0xe833, 8],
    [0x83, 0x36, 0xcc, 0x33, 0xe83c, 7],
    [0x83, 0x36, 0xcd, 0x30, 0xe844, 16],
    [0x83, 0x36, 0xce, 0x36, 0xe856, 14],
    [0x83, 0x36, 0xd0, 0x30, 0xe865, 4295],
    [0x84, 0x30, 0x85, 0x35, 0xf92d, 76],
    [0x84, 0x30, 0x8d, 0x31, 0xf97a, 27],
    [0x84, 0x30, 0x8f, 0x38, 0xf996, 81],
    [0x84, 0x30, 0x97, 0x39, 0xf9e8, 9],
    [0x84, 0x30, 0x98, 0x38, 0xf9f2, 26],
    [0x84, 0x30, 0x9b, 0x34, 0xfa10, 1],
    [0x84, 0x30, 0x9b, 0x35, 0xfa12, 1],
    [0x84, 0x30, 0x9b, 0x36, 0xfa15, 3],
    [0x84, 0x30, 0x9b, 0x39, 0xfa19, 6],
    [0x84, 0x30, 0x9c, 0x35, 0xfa22, 1],
    [0x84, 0x30, 0x9c, 0x36, 0xfa25, 2],
    [0x84, 0x30, 0x9c, 0x38, 0xfa2a, 1030],
    [0x84, 0x31, 0x85, 0x38, 0xfe32, 1],
    [0x84, 0x31, 0x85, 0x39, 0xfe45, 4],
    [0x84, 0x31, 0x86, 0x33, 0xfe53, 1],
    [0x84, 0x31, 0x86, 0x34, 0xfe58, 1],
    [0x84, 0x31, 0x86, 0x35, 0xfe67, 1],
    [0x84, 0x31, 0x86, 0x36, 0xfe6c, 149],
    [0x84, 0x31, 0x95, 0x35, 0xff5f, 129],
    [0x84, 0x31, 0xa2, 0x34, 0xffe6, 26],
];
// We will test 4-byte codes separately
// NOTE(review): findInvalidChars() is not defined in this file (presumably it comes from
// encoding_tests.inc) and appears to fill $invalid and $truncated by reference -- confirm.
findInvalidChars($toUnicode, $invalid, $truncated);
/**
 * Tell whether a byte sequence can NOT be (the beginning of) a GB18030 4-byte code.
 *
 * A sequence is accepted as a possible 4-byte code when its first byte lies in
 * 0x81-0x84 or 0x90-0xE3 and, if a second byte is present, that byte lies in 0x30-0x39.
 * Only the first two bytes are examined.
 *
 * @param int|string $gb Byte sequence to classify. When used as an array_filter() key
 *                       callback, PHP may hand over an int for numeric-looking keys, so the
 *                       value is normalised to a string first.
 * @return bool True if the sequence is not a (prefix of a) 4-byte code, false otherwise.
 */
function notFourByteCode($gb): bool {
    $gb = (string) $gb;

    $first = ord($gb);
    $firstOutsideBmpRange   = $first < 0x81 || $first > 0x84;
    $firstOutsideOtherRange = $first < 0x90 || $first > 0xE3;
    if ($firstOutsideBmpRange && $firstOutsideOtherRange) {
        return true;
    }

    // The lead byte is plausible; a second byte (if any) must be an ASCII digit.
    if (strlen($gb) > 1) {
        $second = ord($gb[1]);
        return $second < 0x30 || $second > 0x39;
    }

    return false;
}
// Remove every key that looks like (the start of) a 4-byte code from the invalid and
// truncated sets; the filter callback receives the array KEY, hence ARRAY_FILTER_USE_KEY.
$invalid = array_filter($invalid, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
$truncated = array_filter($truncated, 'notFourByteCode', ARRAY_FILTER_USE_KEY);

// "\x00%" is the substitution character '%' (set via mb_substitute_character) in UTF-16BE.
testAllValidChars($toUnicode, 'GB18030', 'UTF-16BE', false);
testAllInvalidChars($invalid, $toUnicode, 'GB18030', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'GB18030', 'UTF-16BE', "\x00%");
echo "Tested GB18030 (1 and 2 byte characters) -> UTF-16BE\n";
// Test one random 4-byte code for each range used for Unicode codepoints in BMP

/**
 * Convert the four bytes of a GB18030 4-byte code into a linear, zero-based index.
 *
 * The bytes are treated as mixed-radix digits: the 1st and 3rd bytes count from 0x81
 * (126 values for the 3rd), the 2nd and 4th count from 0x30 (10 values each), so that
 * 0x81 0x30 0x81 0x30 maps to index 0 and each following code maps to the next integer.
 *
 * @param int $byte4 First (most significant) byte of the code.
 * @param int $byte3 Second byte of the code.
 * @param int $byte2 Third byte of the code.
 * @param int $byte1 Fourth (least significant) byte of the code.
 * @return int Linear index of the code.
 */
function fourByteCodeIndex(int $byte4, int $byte3, int $byte2, int $byte1): int {
    return (($byte4 - 0x81) * 10 * 126 * 10)
        + (($byte3 - 0x30) * 10 * 126)
        + (($byte2 - 0x81) * 10)
        + ($byte1 - 0x30);
}
/**
 * Convert a linear, zero-based index back into the bytes of a GB18030 4-byte code.
 *
 * This is the inverse of fourByteCodeIndex(): index 0 yields "\x81\x30\x81\x30" and each
 * successive index yields the next code, with the last byte cycling through 0x30-0x39,
 * the third through 0x81-0xFE, and the second through 0x30-0x39.
 *
 * @param int $index Linear index of the code.
 * @return string The 4-byte GB18030 sequence.
 */
function fourByteCodeFromIndex(int $index): string {
    // Peel off the mixed-radix digits from most to least significant.
    $byte4 = intdiv($index, 10 * 126 * 10) + 0x81;
    $index %= 10 * 126 * 10;

    $byte3 = intdiv($index, 10 * 126) + 0x30;
    $index %= 10 * 126;

    $byte2 = intdiv($index, 10) + 0x81;
    $byte1 = ($index % 10) + 0x30;

    return chr($byte4) . chr($byte3) . chr($byte2) . chr($byte1);
}
// For each contiguous range, pick one pseudo-random member (seeded above via srand, so the
// choice is repeatable) and check that it converts to the matching Unicode codepoint.
foreach ($gb18030_BMP_Mappings as $mapping) {
    [$byte4, $byte3, $byte2, $byte1, $unicode, $n] = $mapping;

    // Offset of the sampled code within the range: 0 .. $n-1
    $i = rand(0, $n - 1);

    // Advance both the GB18030 code and the Unicode codepoint by the same offset
    $gb = fourByteCodeFromIndex(fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) + $i);
    $unicode += $i;

    // pack('n') emits the codepoint as one big-endian 16-bit unit, i.e. UTF-16BE for the BMP
    testValidString($gb, pack('n', $unicode), 'GB18030', 'UTF-16BE');
}
// Invalid 4-byte codes in range for BMP
testInvalidString("\x81\x30\x81\xFF", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x84\x31\xA4\x40", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x84\x31\xA5\x30", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x84\x32\x81\x30", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x85\x31\x81\x30", "\x00\x00\x00%\x00\x00\x00%", "GB18030", "UTF-32BE");

// Valid 4-byte codes for other Unicode planes
testValidString("\x90\x30\x81\x30", "\x00\x01\x00\x00", "GB18030", "UTF-32BE");
testValidString("\xE3\x32\x9A\x35", "\x00\x10\xFF\xFF", "GB18030", "UTF-32BE");

// Invalid 4-byte codes for other Unicode planes
testInvalidString("\x90\x30\x81\xFF", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\xE3\x32\x9A\x36", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\xE4\x30\x81\x35", "\x00\x00\x00%\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x90\x30\x80\x30", "\x00\x00\x00%\x00\x00\x00\x30", "GB18030", "UTF-32BE");

echo "Tested GB18030 4-byte characters <-> UTF-16BE\n";

testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030', false);
echo "Tested UTF-16BE -> GB18030 (1 and 2 byte characters)\n";

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81\x30\x81\xFF", "%", "GB18030", "UTF-8");
convertInvalidString("\xE3\x32\x9A\x36", "%", "GB18030", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Tested GB18030 (1 and 2 byte characters) -> UTF-16BE
Tested GB18030 4-byte characters <-> UTF-16BE
Tested UTF-16BE -> GB18030 (1 and 2 byte characters)
Done!