utf7imap_encoding.phpt 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. --TEST--
  2. Exhaustive test of mUTF-7 (IMAP) encoding verification and conversion
  3. --EXTENSIONS--
  4. mbstring
  5. --FILE--
  6. <?php
  7. include('encoding_tests.inc');
  8. mb_substitute_character(0x25); // '%'
  9. function utf16BE($utf8) {
  10. return mb_convert_encoding($utf8, 'UTF-16BE', 'UTF-8');
  11. }
  12. function mBase64($str) {
  13. return str_replace('=', '', str_replace('/', ',', base64_encode($str)));
  14. }
  15. function testValid($from, $to, $bothWays = true) {
  16. testValidString($from, $to, 'UTF7-IMAP', 'UTF-8', $bothWays);
  17. }
  18. function testInvalid($from, $to) {
  19. testInvalidString($from, $to, 'UTF7-IMAP', 'UTF-8');
  20. }
  21. /* An empty string is valid */
  22. testValid("", "");
  23. echo "Identification passes on empty string... good start!\n";
  24. /* Identification and conversion of ASCII characters (minus &) */
  25. for ($i = 0x20; $i <= 0x7E; $i++) {
  26. if ($i == 0x26) // '&'
  27. continue;
  28. testValid(chr($i), chr($i));
  29. }
  30. echo "Testing all valid single-character ASCII strings... check!\n";
  31. /* Identification and conversion of non-ASCII characters */
  32. for ($i = 0; $i < 0x20; $i++)
  33. testInvalid(chr($i), "%");
  34. for ($i = 0x7F; $i < 256; $i++)
  35. testInvalid(chr($i), "%");
  36. echo "Non-ASCII characters convert to illegal char marker... yes!\n";
  37. /* Identification of '&' when Base-64 encoded */
  38. testValid("&" . mBase64(utf16BE("&")) . "-", "&", false);
  39. echo "& can be Base64-encoded... yes!\n";
  40. /* Identification of unterminated & section */
  41. identifyInvalidString("&", 'UTF7-IMAP');
  42. identifyInvalidString("abc&", 'UTF7-IMAP');
  43. identifyInvalidString("&" . mBase64(utf16BE("ハムサンドイッチ")), 'UTF7-IMAP');
  44. echo "Testing unterminated & sections... yep!\n";
  45. /* Identification of null shifts (& immediately after -)
  46. *
  47. * This is illegal according to the spec for mUTF-7 (IMAP), but currently we are letting
  48. * it pass... among other things, this makes it possible to concatenate UTF-7-IMAP
  49. * strings naively without the concatenated strings being treated as 'invalid'
  50. *
  51. * If ever we want to enforce this part of the spec, uncomment the following test */
  52. /*
  53. identifyInvalidString("&" . mBase64(utf16BE("肉包子")) . "-&" . mBase64(utf16BE("冰淇淋")) . "-", 'UTF7-IMAP');
  54. echo "Testing consecutive & sections which should have been merged... yep!\n";
  55. */
  56. /* Conversion of Base64-encoded ASCII characters (excluding &)
  57. * These should be treated as erroneous and mb_substitute_character should apply */
  58. for ($i = 0x20; $i <= 0x7E; $i++) {
  59. if ($i == 0x26) // '&'
  60. continue;
  61. testInvalid("&" . mBase64(utf16BE(chr($i))) . "-", "%");
  62. }
  63. echo "Testing ASCII characters which are Base64-encoded... great!\n";
  64. /* Conversion of & encoded as &- */
  65. testValid("&-", "&");
  66. testValid("abc&-", "abc&");
  67. testValid("&-.&-", "&.&");
  68. echo "Testing valid strings which use '&-' for '&'... good!\n";
  69. /* Identification of & sections containing non-Base64 */
  70. /* We'll use 6 character strings as a test, since 6 UTF-16 characters is just enough
  71. * to fit perfectly in Base64 encoding, with no padding */
  72. $testString = mBase64(utf16BE("我是打酱油的"));
  73. if (strlen($testString) != 16)
  74. die("Erk!!");
  75. for ($i = 0; $i < 256; $i++) {
  76. if ($i >= 0x30 && $i <= 0x39) // '0'..'9'
  77. continue;
  78. if ($i >= 0x41 && $i <= 0x5A) // 'A'..'Z'
  79. continue;
  80. if ($i >= 0x61 && $i <= 0x7A) // 'a'..'z'
  81. continue;
  82. if ($i == 0x2B || $i == 0x2C) // '+' or ','
  83. continue;
  84. if ($i == 0x2D) // '-'... this will be interpreted as ending the Base64 section
  85. continue;
  86. identifyInvalidString("&" . substr($testString, 0, 11) . chr($i) . "-", 'UTF7-IMAP');
  87. }
  88. echo "Identification fails when Base64 sections contain non-Base64 bytes... right!\n";
  89. /* Tell me, please, how many ways can UTF-16BE text get messed up?
  90. * Why, that's elementary... */
  91. /* 1. The second half of a surrogate pair could come first, */
  92. $testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
  93. if (strlen($testString) != 4)
  94. die("Ouch!");
  95. $testString = substr($testString, 2, 2) . substr($testString, 0, 2);
  96. identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');
  97. /* (Or could appear by itself) */
  98. $testString = substr($testString, 0, 2);
  99. identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');
  100. /* ...and we should detect this wherever it occurs */
  101. $singleChar = mb_convert_encoding("1", 'UTF-16BE', 'ASCII');
  102. $doubleChar = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
  103. if (strlen($doubleChar) != 4)
  104. die("That was supposed to be a surrogate pair");
  105. identifyInvalidString("&" . mBase64($singleChar . $testString) . "-", 'UTF7-IMAP');
  106. identifyInvalidString("&" . mBase64($singleChar . $singleChar . $testString) . "-", 'UTF7-IMAP');
  107. identifyInvalidString("&" . mBase64($singleChar . $singleChar . $singleChar . $testString) . "-", 'UTF7-IMAP');
  108. identifyInvalidString("&" . mBase64($doubleChar . $testString) . "-", 'UTF7-IMAP');
  109. identifyInvalidString("&" . mBase64($singleChar . $doubleChar . $testString) . "-", 'UTF7-IMAP');
  110. identifyInvalidString("&" . mBase64($singleChar . $singleChar . $doubleChar . $testString) . "-", 'UTF7-IMAP');
  111. /* 2. The first half of a surrogate pair might be followed by an invalid 2nd part, */
  112. $testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
  113. $testString = substr($testString, 0, 2) . mb_convert_encoding("a", 'UTF-16BE', 'ASCII');
  114. identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');
  115. /* ...and we should also detect that wherever it occurs... */
  116. identifyInvalidString("&" . mBase64($singleChar . $testString) . "-", 'UTF7-IMAP');
  117. identifyInvalidString("&" . mBase64($singleChar . $singleChar . $testString) . "-", 'UTF7-IMAP');
  118. identifyInvalidString("&" . mBase64($doubleChar . $testString) . "-", 'UTF7-IMAP');
  119. /* 3. The first half of a surrogate pair could come at the end of the string, */
  120. $testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
  121. identifyInvalidString("&" . mBase64(substr($testString, 0, 2)) . "-", 'UTF7-IMAP');
  122. identifyInvalidString("&" . mBase64($singleChar . substr($testString, 0, 2)) . "-", 'UTF7-IMAP');
  123. identifyInvalidString("&" . mBase64($singleChar . $singleChar . substr($testString, 0, 2)) . "-", 'UTF7-IMAP');
  124. /* 4. Or, it could have an odd number of bytes in it! */
  125. $testString = utf16BE("ドーナツ");
  126. $testString = substr($testString, 0, strlen($testString) - 1);
  127. identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');
  128. /* And there is one bonus way to discombobulate your UTF-16BE when it is Base64-encoded...
  129. * The Base64 might not decode to an integral number of bytes
  130. * Or, equivalently... it might not be padded with zeroes (as the RFC requires) */
  131. $testString = utf16BE("☺⛑");
  132. if (strlen($testString) != 4)
  133. die("No good");
  134. $encoded = mBase64($testString);
  135. if (strlen($encoded) != 6)
  136. die("Don't like that");
  137. /* Mess up the padding by replacing the last Base64 character with ',',
  138. * which represents 63 (a number with a 1 in the last bit) */
  139. identifyInvalidString("&" . substr($encoded, 0, strlen($encoded) - 1) . ",-", 'UTF7-IMAP');
  140. echo "Identification fails when UTF-16 text is invalid... no sweat!\n";
  141. /* OK, let's try valid Base64-encoded text now */
  142. /* 2-byte char */
  143. testValid("&" . mBase64(utf16BE("☺")) . "-", "☺");
  144. /* 2 + 2 */
  145. testValid("&" . mBase64(utf16BE("饺子")) . "-", "饺子");
  146. /* 2 + 2 + 2 */
  147. testValid("&" . mBase64(utf16BE("123")) . "-", "123");
  148. /* 2 + 2 + 2 + 2 */
  149. testValid("&" . mBase64(utf16BE("ᄚᄆᄇᄈ")) . "-", "ᄚᄆᄇᄈ");
  150. /* 4 */
  151. $longChar1 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
  152. $longChar2 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-8', 'UTF-32BE');
  153. testValid("&" . mBase64($longChar1) . "-", $longChar2);
  154. /* 2 + 4 */
  155. testValid("&" . mBase64(utf16BE("饼") . $longChar1) . "-", "饼" . $longChar2);
  156. /* 4 + 2 */
  157. testValid("&" . mBase64($longChar1 . utf16BE("饼")) . "-", $longChar2 . "饼");
  158. /* 2 + 4 + 2 */
  159. testValid("&" . mBase64(utf16BE("☺") . $longChar1 . utf16BE("饼")) . "-", "☺" . $longChar2 . "饼");
  160. /* 2 + 2 + 4 */
  161. testValid("&" . mBase64(utf16BE("西瓜") . $longChar1) . "-", "西瓜" . $longChar2);
  162. /* 2 + 2 + 4 + 2 */
  163. testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . utf16BE("☺")) . "-", "西瓜" . $longChar2 . "☺");
  164. /* 2 + 2 + 4 + 4 */
  165. testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . $longChar1) . "-", "西瓜" . $longChar2 . $longChar2);
  166. /* 2 + 2 + 2 + 4 */
  167. testValid("&" . mBase64(utf16BE("西红柿") . $longChar1) . "-", "西红柿" . $longChar2);
  168. /* Multiple sections of valid ASCII _and_ Base64-encoded text */
  169. testValid("123&" . mBase64(utf16BE("123")) . "-abc&" . mBase64(utf16BE("☺")) . "-.", "123123abc☺.");
  170. /* If a & character appears right after a non-ASCII character, we must first close the Base64
  171. * section and then emit &- */
  172. testValidString("☺&", "&Jjo-&-", "UTF-8", "UTF7-IMAP", false);
  173. testValidString("西瓜&", "&iX903A-&-", "UTF-8", "UTF7-IMAP", false);
  174. testValidString("西红柿&", "&iX9+omf,-&-", "UTF-8", "UTF7-IMAP", false);
  175. echo "Identification and conversion of valid text is working... perfect!\n";
  176. // Try illegal Unicode codepoint (> 0x10FFFF)
  177. convertInvalidString("\x00\x20\x00\x00", "%", "UCS-4BE", "UTF7-IMAP");
  178. // Test "long" illegal character markers
  179. mb_substitute_character("long");
  180. convertInvalidString("\x10", "%", "UTF7-IMAP", "UTF-8");
  181. convertInvalidString("\x80", "%", "UTF7-IMAP", "UTF-8");
  182. convertInvalidString("abc&", "abc%", "UTF7-IMAP", "UTF-8"); // The & starts a Base-64 coded section, which is OK... but there's no data in it
  183. convertInvalidString("&**-", "%*-", "UTF7-IMAP", "UTF-8"); // When we hit the first bad byte in a Base-64 coded section, it drops us back into the default mode, so the following characters are literal
  184. echo "Done!\n";
  185. ?>
  186. --EXPECT--
  187. Identification passes on empty string... good start!
  188. Testing all valid single-character ASCII strings... check!
  189. Non-ASCII characters convert to illegal char marker... yes!
  190. & can be Base64-encoded... yes!
  191. Testing unterminated & sections... yep!
  192. Testing ASCII characters which are Base64-encoded... great!
  193. Testing valid strings which use '&-' for '&'... good!
  194. Identification fails when Base64 sections contain non-Base64 bytes... right!
  195. Identification fails when UTF-16 text is invalid... no sweat!
  196. Identification and conversion of valid text is working... perfect!
  197. Done!