hz_encoding.phpt 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. --TEST--
  2. Exhaustive test of verification and conversion of HZ text
  3. --EXTENSIONS--
  4. mbstring
  5. --SKIPIF--
  6. <?php
  7. if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
  8. ?>
  9. --FILE--
  10. <?php
  11. include('encoding_tests.inc');
  // Fixed seed so the randomised parts of this test give consistent results.
  srand(1000);
  // Use '%' (0x25) as the substitute for anything that cannot be converted.
  mb_substitute_character(0x25);

  // Every 7-bit byte except '~' (0x7E) must survive ASCII -> HZ unchanged.
  // '~' is special in HZ and is exercised separately.
  foreach (range(0x00, 0x7F) as $byte) {
    if ($byte === 0x7E) {
      continue;
    }
    $char = chr($byte);
    testValidString($char, $char, 'ASCII', 'HZ');
  }
  echo "Tested ASCII -> HZ\n";

  // The same bytes must also survive the opposite direction, HZ -> ASCII.
  foreach (range(0x00, 0x7F) as $byte) {
    if ($byte === 0x7E) {
      continue;
    }
    $char = chr($byte);
    testValidString($char, $char, 'HZ', 'ASCII');
  }
  echo "Tested HZ -> ASCII\n";
  // Every byte with the high bit set is expected to be rejected while the
  // decoder is in ASCII mode, producing the '%' substitute character.
  // The upper bound is inclusive so that 0xFF is covered as well.
  for ($i = 0x80; $i <= 0xFF; $i++) {
    testInvalidString(chr($i), '%', 'HZ', 'ASCII');
  }
  echo "Tested non-ASCII bytes in ASCII mode\n";
  // Each entry is the argument list for one testValidString() call.
  // NOTE(review): the trailing `false` presumably disables the reverse
  // (ASCII -> HZ) check for escapes that decode to nothing; confirm against
  // encoding_tests.inc.
  $validEscapes = [
    ['~~', '~', 'HZ', 'ASCII'],
    ["~\n", '', 'HZ', 'ASCII', false],
    ['~{~}', '', 'HZ', 'ASCII', false],
    ["~{~\n~}", '', 'HZ', 'ASCII', false],
  ];
  foreach ($validEscapes as $escapeArgs) {
    testValidString(...$escapeArgs);
  }
  echo "Tested valid ~ escapes\n";
  // Follow '~' with every possible byte (0x00 through 0xFF inclusive) and
  // check that anything which is not a valid escape yields a single '%'.
  // Invalid ~ escapes are tried both in ASCII mode and in GB mode.
  for ($i = 0; $i <= 0xFF; $i++) {
    if ($i == 0x0A) {
      continue; // "~\n" is a valid escape, covered by the valid-escape tests
    }

    // ASCII mode: skip "~~" and "~{", which are valid there
    if ($i != 0x7E && $i != 0x7B) {
      testInvalidString("~" . chr($i), '%', 'HZ', 'ASCII');
    }

    // GB mode (between ~{ and ~}): skip "~}", which is valid there
    if ($i != 0x7D) {
      testInvalidString("~{~" . chr($i) . "~}", '%', 'HZ', 'ASCII');
    }
  }
  echo "Tested all invalid ~ escapes\n";
  // Load the GB2312 mapping table into $toUnicode / $fromUnicode, then derive
  // the sets of invalid and truncated byte sequences from the forward table.
  readConversionTable(__DIR__ . '/data/GB2312.txt', $toUnicode, $fromUnicode);
  findInvalidChars($toUnicode, $invalid, $truncated);

  // Two characters convert to Unicode 0x2225, so "\x21\x2C" cannot be
  // round-tripped and is tested in the forward direction only.
  // NOTE(review): the earlier wording of this note named ISO-2022-CN, while
  // this test reads the GB2312 table for HZ; confirm the same pair applies.
  $irreversible = ["\x21\x2C" => true];

  // Test all good GB2312 characters within ~{ ~} escapes.
  // Characters are taken one per string, in shuffled order.
  $goodChars = array_keys($toUnicode);
  shuffle($goodChars);
  while (!empty($goodChars)) {
    $goodChar = array_pop($goodChars);
    $reversible = !isset($irreversible[$goodChar]);
    $fromString = '~{' . $goodChar . '~}';
    $toString = (string) $toUnicode[$goodChar];
    testValidString($fromString, $toString, 'HZ', 'UTF-16BE', $reversible);
  }
  // Feed every invalid GB2312 byte sequence through a ~{ ~} escape.
  // "~" is dropped from the set because escape sequences are tested separately.
  unset($invalid["~"]);
  $pendingBad = array_keys($invalid);
  $validPool = [];
  while ($pendingBad) {
    // Refill and reshuffle the pool of valid characters whenever it runs dry
    if (!$validPool) {
      $validPool = array_keys($toUnicode);
      shuffle($validPool);
    }

    $validChar = array_pop($validPool);
    $badChar = array_pop($pendingBad);

    // The bad sequence should become UTF-16BE '%'; the valid character that
    // follows it should still convert normally.
    $input = '~{' . $badChar . $validChar . '~}';
    $expected = "\x00%" . $toUnicode[$validChar];
    testInvalidString($input, $expected, 'HZ', 'UTF-16BE');
  }
  // Each truncated sequence, placed after an unterminated ~{ escape, is
  // expected to convert to a single UTF-16BE '%'.
  foreach (array_keys($truncated) as $partial) {
    testInvalidString('~{' . $partial, "\x00%", 'HZ', 'UTF-16BE');
  }
  echo "Tested HZ -> UTF-16BE (for all GB2312 characters)\n";
  // Recompute the invalid set for the reverse (Unicode -> GB2312) table.
  // NOTE(review): the fourth argument maps every first byte 0x00-0xFF to 2,
  // presumably the expected unit length in bytes; confirm in encoding_tests.inc.
  findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));

  // ASCII characters are absent from the Unicode -> GB2312 map but *are*
  // valid to convert to HZ, so take them out of the invalid set.
  foreach (range(0x00, 0x7F) as $asciiCode) {
    unset($invalid["\x00" . chr($asciiCode)]);
  }

  $pendingBad = array_keys($invalid);
  $validPool = [];
  while ($pendingBad) {
    // Refill and reshuffle the pool of convertible characters when it is empty
    if (!$validPool) {
      $validPool = array_keys($fromUnicode);
      shuffle($validPool);
    }

    $validChar = array_pop($validPool);
    $badChar = array_pop($pendingBad);

    // The unconvertible character should become '%'; the valid one that
    // follows should be emitted inside a ~{ ~} escape.
    $input = $badChar . $validChar;
    $expected = "%~{" . $fromUnicode[$validChar] . "~}";
    convertInvalidString($input, $expected, 'UTF-16BE', 'HZ');
  }
  echo "Tested UTF-16BE -> HZ (for all GB2312 characters)\n";
  // Switch to "long" illegal character markers. Each malformed HZ input below
  // is still expected to convert to a single '%' in UTF-8.
  mb_substitute_character("long");
  foreach (["~A", "\x80", "~{\x22\x21"] as $malformed) {
    convertInvalidString($malformed, "%", "HZ", "UTF-8");
  }
  echo "Done!\n";
  107. ?>
  108. --EXPECT--
  109. Tested ASCII -> HZ
  110. Tested HZ -> ASCII
  111. Tested non-ASCII bytes in ASCII mode
  112. Tested valid ~ escapes
  113. Tested all invalid ~ escapes
  114. Tested HZ -> UTF-16BE (for all GB2312 characters)
  115. Tested UTF-16BE -> HZ (for all GB2312 characters)
  116. Done!