encoding_tests.inc 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. <?php
  2. // Common code for tests which focus on conversion and verification of text
  3. // in some specific encoding
  // Read a conversion table file with one character code and its equivalent
  // Unicode codepoint on each line, both written as `0x`-prefixed hex numbers
  // and delimited by a tab. Lines starting with '#' are skipped, as is any
  // line which does not match that format.
  //
  // On return, `$from` maps each character (as a byte string) to its codepoint
  // packed as a big-endian 16-bit value (or 32-bit, if `$utf32` is true), and
  // `$to` holds the reverse mapping. Terminates the script if the file cannot
  // be opened.
  function readConversionTable($path, &$from, &$to, $utf32 = false) {
      $from = array();
      $to = array();

      // The table is only ever read, so don't demand write access to it
      $fp = fopen($path, 'r');
      if ($fp === false)
          die("Could not open conversion table: $path");

      // Compare against false explicitly so a falsy line (like a trailing "0")
      // does not end the loop early
      while (($line = fgets($fp, 256)) !== false) {
          if ($line[0] == '#')
              continue;

          if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) {
              $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);

              if ($char == PHP_INT_MAX) {
                  // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
                  // (which can't be represented in a PHP integer)
                  // Decode the hex digits by hand, two at a time, starting after the "0x"
                  $char = "";
                  $lineLength = strlen($line);
                  for ($i = 2; $i < $lineLength; $i += 2) {
                      $substr = substr($line, $i, 2);
                      if (ctype_xdigit($substr))
                          $char .= chr(hexdec($substr));
                      else
                          break;
                  }
              } else {
                  // Emit the shortest byte string which holds the code
                  if ($char <= 0xFF)
                      $char = chr($char); // hex codes must not have leading zero bytes
                  else if ($char <= 0xFFFF)
                      $char = pack('n', $char);
                  else if ($char <= 0xFFFFFF)
                      $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
                  else
                      $char = pack('N', $char);
              }

              $from[$char] = $codepoint;
              $to[$codepoint] = $char;
          }
      }

      fclose($fp);
  }
  // Format a byte string for a failure message: its hex dump in parentheses,
  // preceded by the quoted string itself when it is valid ASCII
  function dbgPrint($str) {
      $hexDump = "(" . bin2hex($str) . ")";
      if (!mb_check_encoding($str, 'ASCII'))
          return $hexDump;
      return '"' . $str . '" ' . $hexDump;
  }
  // Terminate the script unless mb_check_encoding accepts `$goodString` as
  // valid in `$encoding`
  function identifyValidString($goodString, $encoding) {
      if (mb_check_encoding($goodString, $encoding))
          return;
      die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
  }
  // Terminate the script unless mb_check_encoding rejects `$badString` as
  // invalid in `$encoding`
  function identifyInvalidString($badString, $encoding) {
      if (!mb_check_encoding($badString, $encoding))
          return;
      die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
  }
  // Convert `$fromString` from `$fromEncoding` to `$toEncoding` and terminate
  // the script, showing input, expected and actual output, unless the result
  // is exactly `$toString`
  function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
      $actual = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
      if ($actual === $toString)
          return;
      die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($actual));
  }
  // Check that the conversion yields `$toString` and that performing it leaves
  // mbstring's 'illegal_chars' counter untouched
  function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
      $countBefore = mb_get_info('illegal_chars');
      testConversion($fromString, $toString, $fromEncoding, $toEncoding);
      $countAfter = mb_get_info('illegal_chars');
      if ($countAfter === $countBefore)
          return;
      die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
  }
  // Check the conversion of a valid string in the forward direction and,
  // unless `$bothWays` is false, in the reverse direction as well
  function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
      testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
      if (!$bothWays)
          return;
      testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
  }
  // Check that the conversion yields `$toString` and that performing it made
  // mbstring's 'illegal_chars' counter go up
  function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
      $countBefore = mb_get_info('illegal_chars');
      testConversion($fromString, $toString, $fromEncoding, $toEncoding);
      $countAfter = mb_get_info('illegal_chars');
      if ($countAfter <= $countBefore) {
          die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
      }
  }
  // Full check for a valid string: it must first be recognized as valid in
  // `$fromEncoding`, and must then convert cleanly (optionally both ways)
  function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
      // Validation comes first, then conversion
      identifyValidString($fromString, $fromEncoding);

      convertValidString(
          $fromString,
          $toString,
          $fromEncoding,
          $toEncoding,
          $bothWays
      );
  }
  // Full check for an invalid string: it must first be recognized as invalid
  // in `$fromEncoding`, and converting it must then yield `$toString` while
  // bumping the 'illegal_chars' counter
  function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
      // Validation comes first, then conversion
      identifyInvalidString($fromString, $fromEncoding);

      convertInvalidString(
          $fromString,
          $toString,
          $fromEncoding,
          $toEncoding
      );
  }
  // Only for encodings where valid characters can be concatenated together in any
  // way, without any escape sequences
  //
  // Shuffles the keys of `$charMap`, then consumes them in batches of 5 to 10
  // (fewer if not enough are left). Each batch is concatenated into one input
  // string, the mapped values into the expected output, and the pair is passed
  // to testValidString.
  function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
      $remaining = array_keys($charMap);
      shuffle($remaining);

      while (count($remaining) > 0) {
          $batchSize = min(rand(5,10), count($remaining));
          $fromString = '';
          $toString = '';

          for ($taken = 0; $taken < $batchSize; $taken++) {
              $char = array_pop($remaining);
              $fromString .= $char;
              $toString .= $charMap[$char];
          }

          testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
      }
  }
  // For every key of `$badChars` (taken from last to first), build an input of
  // that bad sequence followed by one valid character, and check via
  // testInvalidString that it is rejected and converts to `$replacement`
  // followed by the valid character's mapping.
  //
  // Valid characters are drawn from a shuffled copy of the keys of `$charMap`,
  // which is refilled and reshuffled whenever it runs out.
  function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
      $goodPool = array();

      foreach (array_reverse(array_keys($badChars)) as $badChar) {
          if (count($goodPool) == 0) {
              $goodPool = array_keys($charMap);
              shuffle($goodPool);
          }

          $goodChar = array_pop($goodPool);
          testInvalidString(
              $badChar . $goodChar,
              $replacement . $charMap[$goodChar],
              $fromEncoding,
              $toEncoding
          );
      }
  }
  // Same walk as testAllInvalidChars, but each input is only passed to
  // convertInvalidString, so it is checked for conversion behavior and not run
  // through mb_check_encoding first.
  //
  // Every key of `$badChars` (taken from last to first) is followed by one valid
  // character drawn from a shuffled copy of the keys of `$charMap`, which is
  // refilled and reshuffled whenever it runs out.
  function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
      $goodPool = array();

      foreach (array_reverse(array_keys($badChars)) as $badChar) {
          if (count($goodPool) == 0) {
              $goodPool = array_keys($charMap);
              shuffle($goodPool);
          }

          $goodChar = array_pop($goodPool);
          convertInvalidString(
              $badChar . $goodChar,
              $replacement . $charMap[$goodChar],
              $fromEncoding,
              $toEncoding
          );
      }
  }
  // Check that each key of `$truncated`, taken on its own, is rejected as
  // invalid and converts to exactly `$replacement`
  function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
      foreach ($truncated as $truncatedChar => $unused)
          testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding);
  }
  // For variable-width encodings, where we have an exhaustive list of
  // all valid characters of any width
  //
  // `$valid` has the valid characters as its keys. On return, `$invalid` has as
  // keys the byte sequences which are neither a valid character nor the start
  // of one, and `$truncated` has as keys the sequences which are cut-off
  // beginnings of a character.
  //
  // `$startBytes` maps from first-byte values to the corresponding character length
  // (For encodings where the first byte can tell you the length of a multi-byte
  // character)
  // Note that `$startBytes` can be partial!
  function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
      $invalid = array();
      $truncated = array();

      /* Every proper prefix of a valid character */
      $prefixes = array();
      foreach ($valid as $char => $unicode) {
          $charLength = strlen($char);
          for ($len = 1; $len < $charLength; $len++)
              $prefixes[substr($char, 0, $len)] = true;
      }

      /* Length unknown: classify one sequence, and if it is a prefix of a longer
       * valid character, classify all its one-byte extensions too */
      $classify = function($seq) use($valid, $prefixes, &$invalid, &$truncated, &$classify) {
          if (isset($valid[$seq]))
              return;
          if (!isset($prefixes[$seq])) {
              $invalid[$seq] = true;
              return;
          }
          $truncated[$seq] = true;
          for ($next = 0; $next < 256; $next++)
              $classify($seq . chr($next));
      };

      /* Length known from the first byte: `$remaining` bytes are still missing
       * from `$prefix` */
      $fillToLength = function($prefix, $remaining) use($valid, &$invalid, &$truncated, &$fillToLength) {
          if ($remaining == 0) {
              if (!isset($valid[$prefix]))
                  $invalid[$prefix] = true;
              return;
          }

          $truncated[$prefix] = true;
          for ($next = 0; $next < 256; $next++) {
              $seq = $prefix . chr($next);
              if ($remaining == 1) {
                  /* Last byte; check it here rather than recursing once more */
                  if (!isset($valid[$seq]))
                      $invalid[$seq] = true;
              } else {
                  $fillToLength($seq, $remaining - 1);
              }
          }
      };

      for ($byte = 0; $byte < 256; $byte++) {
          if (isset($startBytes[$byte]))
              $fillToLength(chr($byte), $startBytes[$byte] - 1);
          else
              $classify(chr($byte));
      }
  }
  // Run the whole suite for one encoding, driven by the conversion table at
  // `$path`:
  //  - `$encoding` -> UTF-16BE for all valid, invalid and truncated sequences
  //  - UTF-16BE -> `$encoding` for 2-byte sequences missing from the table,
  //    which must convert to `$replacement`
  // `$startBytes` is handed to findInvalidChars for the first direction.
  // Prints one line per direction once its checks are done.
  function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
      $utf16 = 'UTF-16BE';
      $utf16Replacement = "\x00%";

      srand(1000); // Make results consistent
      mb_substitute_character(0x25); // '%'
      readConversionTable($path, $toUnicode, $fromUnicode);

      findInvalidChars($toUnicode, $invalid, $truncated, $startBytes);
      testAllValidChars($toUnicode, $encoding, $utf16);
      testAllInvalidChars($invalid, $toUnicode, $encoding, $utf16, $utf16Replacement);
      testTruncatedChars($truncated, $encoding, $utf16, $utf16Replacement);
      echo "Tested $encoding -> UTF-16BE\n";

      // Reverse direction: every first byte is declared to start a 2-byte character
      $twoBytesEach = array_fill_keys(range(0,0xFF), 2);
      findInvalidChars($fromUnicode, $invalid, $unused, $twoBytesEach);
      convertAllInvalidChars($invalid, $fromUnicode, $utf16, $encoding, $replacement);
      echo "Tested UTF-16BE -> $encoding\n";
  }
  211. ?>