123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- --TEST--
- Exhaustive test of verification and conversion of HZ text
- --EXTENSIONS--
- mbstring
- --SKIPIF--
- <?php
- if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
- ?>
- --FILE--
- <?php
- include('encoding_tests.inc');
- srand(1000); // Make results consistent
- mb_substitute_character(0x25); // '%'
- for ($i = 0; $i < 0x80; $i++) {
- if ($i != 0x7E) // ~ is special and will be tested separately
- testValidString(chr($i), chr($i), 'ASCII', 'HZ');
- }
- echo "Tested ASCII -> HZ\n";
- for ($i = 0; $i < 0x80; $i++) {
- if ($i != 0x7E)
- testValidString(chr($i), chr($i), 'HZ', 'ASCII');
- }
- echo "Tested HZ -> ASCII\n";
- for ($i = 0x80; $i < 0xFF; $i++) {
- testInvalidString(chr($i), '%', 'HZ', 'ASCII');
- }
- echo "Tested non-ASCII bytes in ASCII mode\n";
- testValidString('~~', '~', 'HZ', 'ASCII');
- testValidString("~\n", '', 'HZ', 'ASCII', false);
- testValidString('~{~}', '', 'HZ', 'ASCII', false);
- testValidString("~{~\n~}", '', 'HZ', 'ASCII', false);
- echo "Tested valid ~ escapes\n";
- for ($i = 0; $i < 0xFF; $i++) {
- if ($i != 0x0A) {
- // Try invalid ~ escapes both in ASCII and GB modes
- if ($i != 0x7E && $i != 0x7B) // not {
- testInvalidString("~" . chr($i), '%', 'HZ', 'ASCII');
- if ($i != 0x7D) // not }
- testInvalidString("~{~" . chr($i) . "~}", '%', 'HZ', 'ASCII');
- }
- }
- echo "Tested all invalid ~ escapes\n";
- readConversionTable(__DIR__ . '/data/GB2312.txt', $toUnicode, $fromUnicode);
- findInvalidChars($toUnicode, $invalid, $truncated);
- // Two characters in ISO-2022-CN convert to Unicode 0x2225
- $irreversible = ["\x21\x2C" => true];
- // Test all good GB2312 characters within ~{ ~} escapes
- $goodChars = array_keys($toUnicode);
- shuffle($goodChars);
- while (!empty($goodChars)) {
- $reversible = true;
- $length = 1; //min(rand(5,10), count($goodChars));
- $fromString = $toString = '';
- while ($length--) {
- $goodChar = array_pop($goodChars);
- $fromString .= $goodChar;
- $toString .= $toUnicode[$goodChar];
- if (isset($irreversible[$goodChar]))
- $reversible = false;
- }
- testValidString('~{' . $fromString . '~}', $toString, 'HZ', 'UTF-16BE', $reversible);
- }
- // Test all invalid GB2312 characters within ~{ ~} escapes
- // However, don't test escape sequences; we will do those separately below
- unset($invalid["~"]);
- $badChars = array_keys($invalid);
- $goodChars = array();
- while (!empty($badChars)) {
- if (empty($goodChars)) {
- $goodChars = array_keys($toUnicode);
- shuffle($goodChars);
- }
- $goodChar = array_pop($goodChars);
- $fromString = array_pop($badChars) . $goodChar;
- $toString = "\x00%" . $toUnicode[$goodChar];
- testInvalidString('~{' . $fromString . '~}', $toString, 'HZ', 'UTF-16BE');
- }
- $truncatedChars = array_keys($truncated);
- foreach ($truncatedChars as $truncatedChar) {
- testInvalidString('~{' . $truncatedChar, "\x00%", 'HZ', 'UTF-16BE');
- }
- echo "Tested HZ -> UTF-16BE (for all GB2312 characters)\n";
- findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
- // Although they do not appear in the Unicode -> GB2312 map, ASCII characters *are*
- // valid to convert to HZ
- for ($i = 0; $i <= 0x7F; $i++)
- unset($invalid["\x00" . chr($i)]);
- $badChars = array_keys($invalid);
- $goodChars = array();
- while (!empty($badChars)) {
- if (empty($goodChars)) {
- $goodChars = array_keys($fromUnicode);
- shuffle($goodChars);
- }
- $goodChar = array_pop($goodChars);
- $fromString = array_pop($badChars) . $goodChar;
- $toString = "%~{" . $fromUnicode[$goodChar] . "~}";
- convertInvalidString($fromString, $toString, 'UTF-16BE', 'HZ');
- }
- echo "Tested UTF-16BE -> HZ (for all GB2312 characters)\n";
- // Test "long" illegal character markers
- mb_substitute_character("long");
- convertInvalidString("~A", "%", "HZ", "UTF-8");
- convertInvalidString("\x80", "%", "HZ", "UTF-8");
- convertInvalidString("~{\x22\x21", "%", "HZ", "UTF-8");
- echo "Done!\n";
- ?>
- --EXPECT--
- Tested ASCII -> HZ
- Tested HZ -> ASCII
- Tested non-ASCII bytes in ASCII mode
- Tested valid ~ escapes
- Tested all invalid ~ escapes
- Tested HZ -> UTF-16BE (for all GB2312 characters)
- Tested UTF-16BE -> HZ (for all GB2312 characters)
- Done!
|