utf8.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. #include <CUnit/CUnit.h>
  2. #include <CUnit/Basic.h>
  3. #include "mosquitto.h"
  4. /* Test data taken from
  5. * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt but modified for
  6. * updated standard (no 5, 6 byte lengths) */
  /*
   * Run mosquitto_validate_utf8() over the first `len` bytes of `text` and
   * record a CUnit assertion that the return code equals `expected`.
   *
   * text     - bytes to validate (may be NULL, may contain embedded NULs)
   * len      - number of bytes handed to the validator
   * expected - the MOSQ_ERR_* code the validator is expected to return
   */
  static void utf8_helper_len(const char *text, int len, int expected)
  {
      /* The local keeps the name `result` because CU_ASSERT_EQUAL stringifies
       * its arguments into the failure message. */
      const int result = mosquitto_validate_utf8(text, len);

      CU_ASSERT_EQUAL(result, expected);
  }
  /*
   * Convenience wrapper for NUL-terminated test strings: the length passed
   * to the validator is strlen(text), so the input must not rely on embedded
   * NUL bytes (use utf8_helper_len() for those).
   *
   * text     - NUL-terminated bytes to validate
   * expected - the MOSQ_ERR_* code the validator is expected to return
   */
  static void utf8_helper(const char *text, int expected)
  {
      const int byte_count = (int)strlen(text);

      utf8_helper_len(text, byte_count, expected);
  }
  17. static void TEST_utf8_empty(void)
  18. {
  19. utf8_helper_len(NULL, 0, MOSQ_ERR_INVAL);
  20. }
  21. static void TEST_utf8_valid(void)
  22. {
  23. /* 1 Some correct UTF-8 text */
  24. utf8_helper("", MOSQ_ERR_SUCCESS);
  25. utf8_helper("You should see the Greek word 'kosme': \"κόσμε\"", MOSQ_ERR_SUCCESS);
  26. }
  27. static void TEST_utf8_truncated(void)
  28. {
  29. uint8_t buf[4];
  30. /* As per boundary condition tests, but less one character */
  31. buf[0] = 0xC2; buf[1] = 0;
  32. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  33. buf[0] = 0xE0; buf[1] = 0xA0; buf[2] = 0;
  34. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  35. buf[0] = 0xF0; buf[1] = 0x90; buf[2] = 0x80; buf[3] = 0;
  36. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  37. }
  38. static void TEST_utf8_boundary_conditions(void)
  39. {
  40. /* 2 Boundary condition test cases */
  41. /* 2.1 First possible sequence of a certain length */
  42. utf8_helper_len("2.1.1 1 byte (U-00000000): \"\0\"", 39, MOSQ_ERR_MALFORMED_UTF8);
  43. utf8_helper("2.1.2 2 bytes (U-00000080): \"€\"", MOSQ_ERR_MALFORMED_UTF8); /* control char */
  44. utf8_helper("2.1.3 3 bytes (U-00000800): \"à €\"", MOSQ_ERR_SUCCESS);
  45. utf8_helper("2.1.4 4 bytes (U-00010000): \"�\"", MOSQ_ERR_SUCCESS);
  46. /* 2.2 Last possible sequence of a certain length */
  47. utf8_helper("2.2.1 1 byte (U-0000007F): \"\"", MOSQ_ERR_MALFORMED_UTF8); /* control char */
  48. utf8_helper("2.2.2 2 bytes (U-000007FF): \"ß¿\"", MOSQ_ERR_SUCCESS);
  49. /* Non character */
  50. utf8_helper("2.2.3 3 bytes (U-0000FFFF): \"ï¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  51. /* Non character */
  52. utf8_helper("2.2.4 4 bytes (U-0010FFFF): \"÷¿¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  53. /* 2.3 Other boundary conditions */
  54. utf8_helper("2.3.1 U-0000D7FF = ed 9f bf = \"퟿\"", MOSQ_ERR_SUCCESS);
  55. utf8_helper("2.3.2 U-0000E000 = ee 80 80 = \"\"", MOSQ_ERR_SUCCESS);
  56. utf8_helper("2.3.3 U-0000FFFD = ef bf bd = \"�\"", MOSQ_ERR_SUCCESS);
  57. /* Non character */
  58. utf8_helper("2.3.4 U-0010FFFF = f4 8f bf bf = \"�\"", MOSQ_ERR_MALFORMED_UTF8);
  59. /* This used to be valid in pre-2003 utf-8 */
  60. utf8_helper("2.3.5 U-00110000 = f4 90 80 80 = \"�\"", MOSQ_ERR_MALFORMED_UTF8);
  61. }
  62. static void TEST_utf8_malformed_sequences(void)
  63. {
  64. uint8_t buf[100];
  65. int i;
  66. /* 3 Malformed sequences */
  67. /* 3.1 Unexpected continuation bytes */
  68. utf8_helper("3.1.1 First continuation byte 0x80: \"€\"", MOSQ_ERR_MALFORMED_UTF8);
  69. utf8_helper("3.1.2 Last continuation byte 0xbf: \"¿\"", MOSQ_ERR_MALFORMED_UTF8);
  70. utf8_helper("3.1.3 2 continuation bytes: \"€¿\"", MOSQ_ERR_MALFORMED_UTF8);
  71. utf8_helper("3.1.4 3 continuation bytes: \"€¿€\"", MOSQ_ERR_MALFORMED_UTF8);
  72. utf8_helper("3.1.5 4 continuation bytes: \"€¿€¿\"", MOSQ_ERR_MALFORMED_UTF8);
  73. utf8_helper("3.1.6 5 continuation bytes: \"€¿€¿€\"", MOSQ_ERR_MALFORMED_UTF8);
  74. utf8_helper("3.1.7 6 continuation bytes: \"€¿€¿€¿\"", MOSQ_ERR_MALFORMED_UTF8);
  75. utf8_helper("3.1.8 7 continuation bytes: \"€¿€¿€¿€\"", MOSQ_ERR_MALFORMED_UTF8);
  76. /* 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): */
  77. memset(buf, 0, sizeof(buf));
  78. for(i=0x80; i<0x90; i++){
  79. buf[i-0x80] = (uint8_t)i;
  80. }
  81. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  82. memset(buf, 0, sizeof(buf));
  83. for(i=0x90; i<0xa0; i++){
  84. buf[i-0x90] = (uint8_t)i;
  85. }
  86. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  87. for(i=0x80; i<0xA0; i++){
  88. buf[0] = (uint8_t)i;
  89. buf[1] = 0;
  90. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  91. }
  92. for(i=0xA0; i<0xC0; i++){
  93. buf[0] = (uint8_t)i;
  94. buf[1] = 0;
  95. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  96. }
  97. /* 3.2 Lonely start characters */
  98. /* 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf),
  99. each followed by a space character: */
  100. utf8_helper("À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß ", MOSQ_ERR_MALFORMED_UTF8);
  101. for(i=0xC0; i<0xE0; i++){
  102. buf[0] = (uint8_t)i;
  103. buf[1] = ' ';
  104. buf[2] = 0;
  105. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  106. }
  107. /* 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef),
  108. each followed by a space character: */
  109. utf8_helper("\"à á â ã ä å æ ç è é ê ë ì í î ï \"", MOSQ_ERR_MALFORMED_UTF8);
  110. for(i=0xe0; i<0xf0; i++){
  111. buf[0] = (uint8_t)i;
  112. buf[1] = ' ';
  113. buf[2] = 0;
  114. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  115. }
  116. /* 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7),
  117. each followed by a space character: */
  118. utf8_helper("\"ð ñ ò ó ô õ ö ÷ \"", MOSQ_ERR_MALFORMED_UTF8);
  119. for(i=0xF0; i<0xF8; i++){
  120. buf[0] = (uint8_t)i;
  121. buf[1] = ' ';
  122. buf[2] = 0;
  123. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  124. }
  125. /* 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb),
  126. each followed by a space character: */
  127. utf8_helper("\"ø ù ú û \"", MOSQ_ERR_MALFORMED_UTF8);
  128. for(i=0xF8; i<0xFC; i++){
  129. buf[0] = (uint8_t)i;
  130. buf[1] = ' ';
  131. buf[2] = 0;
  132. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  133. }
  134. /* 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd),
  135. each followed by a space character: */
  136. utf8_helper("\"ü ý \"", MOSQ_ERR_MALFORMED_UTF8);
  137. utf8_helper("ü ", MOSQ_ERR_MALFORMED_UTF8);
  138. utf8_helper("ý ", MOSQ_ERR_MALFORMED_UTF8);
  139. for(i=0xFC; i<0xFE; i++){
  140. buf[0] = (uint8_t)i;
  141. buf[1] = ' ';
  142. buf[2] = 0;
  143. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  144. }
  145. /* 3.3 Sequences with last continuation byte missing
  146. All bytes of an incomplete sequence should be signalled as a single
  147. malformed sequence, i.e., you should see only a single replacement
  148. character in each of the next 10 tests. (Characters as in section 2) */
  149. utf8_helper("3.3.1 2-byte sequence with last byte missing (U+0000): \"À\"", MOSQ_ERR_MALFORMED_UTF8);
  150. utf8_helper("3.3.2 3-byte sequence with last byte missing (U+0000): \"à€\"", MOSQ_ERR_MALFORMED_UTF8);
  151. utf8_helper("3.3.3 4-byte sequence with last byte missing (U+0000): \"ð€€\"", MOSQ_ERR_MALFORMED_UTF8);
  152. utf8_helper("3.3.4 5-byte sequence with last byte missing (U+0000): \"ø€€€\"", MOSQ_ERR_MALFORMED_UTF8);
  153. utf8_helper("3.3.5 6-byte sequence with last byte missing (U+0000): \"ü€€€€\"", MOSQ_ERR_MALFORMED_UTF8);
  154. utf8_helper("3.3.6 2-byte sequence with last byte missing (U-000007FF): \"ß\"", MOSQ_ERR_MALFORMED_UTF8);
  155. utf8_helper("3.3.7 3-byte sequence with last byte missing (U-0000FFFF): \"ï¿\"", MOSQ_ERR_MALFORMED_UTF8);
  156. utf8_helper("3.3.8 4-byte sequence with last byte missing (U-001FFFFF): \"÷¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  157. utf8_helper("3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): \"û¿¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  158. utf8_helper("3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): \"ý¿¿¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  159. /* 3.4 Concatenation of incomplete sequences
  160. All the 10 sequences of 3.3 concatenated, you should see 10 malformed
  161. sequences being signalled:*/
  162. utf8_helper("\"Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  163. /* 3.5 Impossible bytes
  164. The following two bytes cannot appear in a correct UTF-8 string */
  165. utf8_helper("3.5.1 fe = \"þ\"", MOSQ_ERR_MALFORMED_UTF8);
  166. utf8_helper("3.5.2 ff = \"ÿ\"", MOSQ_ERR_MALFORMED_UTF8);
  167. utf8_helper("3.5.3 fe fe ff ff = \"þþÿÿ\"", MOSQ_ERR_MALFORMED_UTF8);
  168. }
  169. static void TEST_utf8_overlong_encoding(void)
  170. {
  171. /* 4 Overlong sequences
  172. The following sequences are not malformed according to the letter of
  173. the Unicode 2.0 standard. However, they are longer then necessary and
  174. a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8
  175. decoder" should reject them just like malformed sequences for two
  176. reasons: (1) It helps to debug applications if overlong sequences are
  177. not treated as valid representations of characters, because this helps
  178. to spot problems more quickly. (2) Overlong sequences provide
  179. alternative representations of characters, that could maliciously be
  180. used to bypass filters that check only for ASCII characters. For
  181. instance, a 2-byte encoded line feed (LF) would not be caught by a
  182. line counter that counts only 0x0a bytes, but it would still be
  183. processed as a line feed by an unsafe UTF-8 decoder later in the
  184. pipeline. From a security point of view, ASCII compatibility of UTF-8
  185. sequences means also, that ASCII characters are *only* allowed to be
  186. represented by ASCII bytes in the range 0x00-0x7f. To ensure this
  187. aspect of ASCII compatibility, use only "safe UTF-8 decoders" that
  188. reject overlong UTF-8 sequences for which a shorter encoding exists. */
  189. /* 4.1 Examples of an overlong ASCII character
  190. With a safe UTF-8 decoder, all of the following five overlong
  191. representations of the ASCII character slash ("/") should be rejected
  192. like a malformed UTF-8 sequence, for instance by substituting it with
  193. a replacement character. If you see a slash below, you do not have a
  194. safe UTF-8 decoder! */
  195. utf8_helper("4.1.1 U+002F = c0 af = \"À¯\"", MOSQ_ERR_MALFORMED_UTF8);
  196. utf8_helper("4.1.2 U+002F = e0 80 af = \"à€¯\"", MOSQ_ERR_MALFORMED_UTF8);
  197. utf8_helper("4.1.3 U+002F = f0 80 80 af = \"ð€€¯\"", MOSQ_ERR_MALFORMED_UTF8);
  198. utf8_helper("4.1.4 U+002F = f8 80 80 80 af = \"ø€€€¯\"", MOSQ_ERR_MALFORMED_UTF8);
  199. utf8_helper("4.1.5 U+002F = fc 80 80 80 80 af = \"ü€€€€¯\"", MOSQ_ERR_MALFORMED_UTF8);
  200. /* 4.2 Maximum overlong sequences
  201. Below you see the highest Unicode value that is still resulting in an
  202. overlong sequence if represented with the given number of bytes. This
  203. is a boundary test for safe UTF-8 decoders. All five characters should
  204. be rejected like malformed UTF-8 sequences. */
  205. utf8_helper("4.2.1 U-0000007F = c1 bf = \"Á¿\"", MOSQ_ERR_MALFORMED_UTF8);
  206. utf8_helper("4.2.2 U-000007FF = e0 9f bf = \"àŸ¿\"", MOSQ_ERR_MALFORMED_UTF8);
  207. utf8_helper("4.2.3 U-0000FFFF = f0 8f bf bf = \"�\"", MOSQ_ERR_MALFORMED_UTF8);
  208. utf8_helper("4.2.4 U-001FFFFF = f8 87 bf bf bf = \"ø‡¿¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  209. utf8_helper("4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = \"üƒ¿¿¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  210. /* 4.3 Overlong representation of the NUL character
  211. The following five sequences should also be rejected like malformed
  212. UTF-8 sequences and should not be treated like the ASCII NUL
  213. character. */
  214. utf8_helper("4.3.1 U+0000 = c0 80 = \"À€\"", MOSQ_ERR_MALFORMED_UTF8);
  215. utf8_helper("4.3.2 U+0000 = e0 80 80 = \"à€€\"", MOSQ_ERR_MALFORMED_UTF8);
  216. utf8_helper("4.3.3 U+0000 = f0 80 80 80 = \"ð€€€\"", MOSQ_ERR_MALFORMED_UTF8);
  217. utf8_helper("4.3.4 U+0000 = f8 80 80 80 80 = \"ø€€€€\"", MOSQ_ERR_MALFORMED_UTF8);
  218. utf8_helper("4.3.5 U+0000 = fc 80 80 80 80 80 = \"ü€€€€€\"", MOSQ_ERR_MALFORMED_UTF8);
  219. }
  220. static void TEST_utf8_illegal_code_positions(void)
  221. {
  222. /* 5 Illegal code positions
  223. The following UTF-8 sequences should be rejected like malformed
  224. sequences, because they never represent valid ISO 10646 characters and
  225. a UTF-8 decoder that accepts them might introduce security problems
  226. comparable to overlong UTF-8 sequences. */
  227. /* 5.1 Single UTF-16 surrogates */
  228. utf8_helper("5.1.1 U+D800 = ed a0 80 = \"í €\"", MOSQ_ERR_MALFORMED_UTF8);
  229. utf8_helper("5.1.2 U+DB7F = ed ad bf = \"í­¿\"", MOSQ_ERR_MALFORMED_UTF8);
  230. utf8_helper("5.1.3 U+DB80 = ed ae 80 = \"í®€\"", MOSQ_ERR_MALFORMED_UTF8);
  231. utf8_helper("5.1.4 U+DBFF = ed af bf = \"í¯¿\"", MOSQ_ERR_MALFORMED_UTF8);
  232. utf8_helper("5.1.5 U+DC00 = ed b0 80 = \"í°€\"", MOSQ_ERR_MALFORMED_UTF8);
  233. utf8_helper("5.1.6 U+DF80 = ed be 80 = \"í¾€\"", MOSQ_ERR_MALFORMED_UTF8);
  234. utf8_helper("5.1.7 U+DFFF = ed bf bf = \"í¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  235. /* 5.2 Paired UTF-16 surrogates */
  236. utf8_helper("5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = \"𐀀\"", MOSQ_ERR_MALFORMED_UTF8);
  237. utf8_helper("5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = \"𐏿\"", MOSQ_ERR_MALFORMED_UTF8);
  238. utf8_helper("5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = \"í­¿í°€\"", MOSQ_ERR_MALFORMED_UTF8);
  239. utf8_helper("5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = \"í­¿í¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  240. utf8_helper("5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = \"󰀀\"", MOSQ_ERR_MALFORMED_UTF8);
  241. utf8_helper("5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = \"󰏿\"", MOSQ_ERR_MALFORMED_UTF8);
  242. utf8_helper("5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = \"􏰀\"", MOSQ_ERR_MALFORMED_UTF8);
  243. utf8_helper("5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = \"􏿿\"", MOSQ_ERR_MALFORMED_UTF8);
  244. /* 5.3 Noncharacter code positions
  245. The following "noncharacters" are "reserved for internal use" by
  246. applications, and according to older versions of the Unicode Standard
  247. "should never be interchanged". Unicode Corrigendum #9 dropped the
  248. latter restriction. Nevertheless, their presence in incoming UTF-8 data
  249. can remain a potential security risk, depending on what use is made of
  250. these codes subsequently. Examples of such internal use:
  251. - Some file APIs with 16-bit characters may use the integer value -1
  252. = U+FFFF to signal an end-of-file (EOF) or error condition.
  253. - In some UTF-16 receivers, code point U+FFFE might trigger a
  254. byte-swap operation (to convert between UTF-16LE and UTF-16BE).
  255. With such internal use of noncharacters, it may be desirable and safer
  256. to block those code points in UTF-8 decoders, as they should never
  257. occur legitimately in incoming UTF-8 data, and could trigger unsafe
  258. behaviour in subsequent processing.
  259. Particularly problematic noncharacters in 16-bit applications: */
  260. utf8_helper("5.3.1 U+FFFE = ef bf be = \"￾\"", MOSQ_ERR_MALFORMED_UTF8);
  261. utf8_helper("5.3.2 U+FFFF = ef bf bf = \"ï¿¿\"", MOSQ_ERR_MALFORMED_UTF8);
  262. /* Other noncharacters: */
  263. /* FIXME - these need splitting up into separate tests. */
  264. utf8_helper("5.3.3 U+FDD0 .. U+FDEF = \"�﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜�﷞﷟﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯\"", MOSQ_ERR_MALFORMED_UTF8);
  265. utf8_helper("ï·�", MOSQ_ERR_MALFORMED_UTF8);
  266. utf8_helper("ï·‘", MOSQ_ERR_MALFORMED_UTF8);
  267. utf8_helper("ï·’", MOSQ_ERR_MALFORMED_UTF8);
  268. utf8_helper("ï·“", MOSQ_ERR_MALFORMED_UTF8);
  269. utf8_helper("ï·”", MOSQ_ERR_MALFORMED_UTF8);
  270. utf8_helper("ï·•", MOSQ_ERR_MALFORMED_UTF8);
  271. utf8_helper("ï·–", MOSQ_ERR_MALFORMED_UTF8);
  272. utf8_helper("ï·—", MOSQ_ERR_MALFORMED_UTF8);
  273. utf8_helper("ï·˜", MOSQ_ERR_MALFORMED_UTF8);
  274. utf8_helper("ï·™", MOSQ_ERR_MALFORMED_UTF8);
  275. utf8_helper("ï·š", MOSQ_ERR_MALFORMED_UTF8);
  276. utf8_helper("ï·›", MOSQ_ERR_MALFORMED_UTF8);
  277. utf8_helper("﷜", MOSQ_ERR_MALFORMED_UTF8);
  278. utf8_helper("ï·�", MOSQ_ERR_MALFORMED_UTF8);
  279. utf8_helper("ï·ž", MOSQ_ERR_MALFORMED_UTF8);
  280. utf8_helper("ï·Ÿ", MOSQ_ERR_MALFORMED_UTF8);
  281. utf8_helper("ï· ", MOSQ_ERR_MALFORMED_UTF8);
  282. utf8_helper("ï·¡", MOSQ_ERR_MALFORMED_UTF8);
  283. utf8_helper("ï·¢", MOSQ_ERR_MALFORMED_UTF8);
  284. utf8_helper("ï·£", MOSQ_ERR_MALFORMED_UTF8);
  285. utf8_helper("ï·¤", MOSQ_ERR_MALFORMED_UTF8);
  286. utf8_helper("ï·¥", MOSQ_ERR_MALFORMED_UTF8);
  287. utf8_helper("ï·¦", MOSQ_ERR_MALFORMED_UTF8);
  288. utf8_helper("ï·§", MOSQ_ERR_MALFORMED_UTF8);
  289. utf8_helper("ï·¨", MOSQ_ERR_MALFORMED_UTF8);
  290. utf8_helper("ï·©", MOSQ_ERR_MALFORMED_UTF8);
  291. utf8_helper("ï·ª", MOSQ_ERR_MALFORMED_UTF8);
  292. utf8_helper("ï·«", MOSQ_ERR_MALFORMED_UTF8);
  293. utf8_helper("ï·¬", MOSQ_ERR_MALFORMED_UTF8);
  294. utf8_helper("ï·­", MOSQ_ERR_MALFORMED_UTF8);
  295. utf8_helper("ï·®", MOSQ_ERR_MALFORMED_UTF8);
  296. utf8_helper("ï·¯", MOSQ_ERR_MALFORMED_UTF8);
  297. /* 5.3.4 U+nFFFE U+nFFFF (for n = 1..10) */
  298. utf8_helper("🿾", MOSQ_ERR_MALFORMED_UTF8);
  299. utf8_helper("🿿", MOSQ_ERR_MALFORMED_UTF8);
  300. utf8_helper("𯿾", MOSQ_ERR_MALFORMED_UTF8);
  301. utf8_helper("𯿿", MOSQ_ERR_MALFORMED_UTF8);
  302. utf8_helper("ð¿¿¾", MOSQ_ERR_MALFORMED_UTF8);
  303. utf8_helper("ð¿¿¿", MOSQ_ERR_MALFORMED_UTF8);
  304. utf8_helper("�", MOSQ_ERR_MALFORMED_UTF8);
  305. utf8_helper("�", MOSQ_ERR_MALFORMED_UTF8);
  306. utf8_helper("񟿾", MOSQ_ERR_MALFORMED_UTF8);
  307. utf8_helper("񟿿", MOSQ_ERR_MALFORMED_UTF8);
  308. utf8_helper("񯿾", MOSQ_ERR_MALFORMED_UTF8);
  309. utf8_helper("񯿿", MOSQ_ERR_MALFORMED_UTF8);
  310. utf8_helper("ñ¿¿¾", MOSQ_ERR_MALFORMED_UTF8);
  311. utf8_helper("ñ¿¿¿", MOSQ_ERR_MALFORMED_UTF8);
  312. utf8_helper("�", MOSQ_ERR_MALFORMED_UTF8);
  313. utf8_helper("�", MOSQ_ERR_MALFORMED_UTF8);
  314. utf8_helper("򟿾", MOSQ_ERR_MALFORMED_UTF8);
  315. utf8_helper("òŸ¿¿", MOSQ_ERR_MALFORMED_UTF8);
  316. utf8_helper("򯿾", MOSQ_ERR_MALFORMED_UTF8);
  317. utf8_helper("򯿿", MOSQ_ERR_MALFORMED_UTF8);
  318. utf8_helper("ò¿¿¾", MOSQ_ERR_MALFORMED_UTF8);
  319. utf8_helper("ò¿¿¿", MOSQ_ERR_MALFORMED_UTF8);
  320. utf8_helper("�", MOSQ_ERR_MALFORMED_UTF8);
  321. utf8_helper("�", MOSQ_ERR_MALFORMED_UTF8);
  322. utf8_helper("󟿾", MOSQ_ERR_MALFORMED_UTF8);
  323. utf8_helper("óŸ¿¿", MOSQ_ERR_MALFORMED_UTF8);
  324. utf8_helper("󯿾", MOSQ_ERR_MALFORMED_UTF8);
  325. utf8_helper("󯿿", MOSQ_ERR_MALFORMED_UTF8);
  326. utf8_helper("ó¿¿¾", MOSQ_ERR_MALFORMED_UTF8);
  327. utf8_helper("ó¿¿¿", MOSQ_ERR_MALFORMED_UTF8);
  328. utf8_helper("�", MOSQ_ERR_MALFORMED_UTF8);
  329. utf8_helper("�", MOSQ_ERR_MALFORMED_UTF8);
  330. }
  331. void TEST_utf8_control_characters(void)
  332. {
  333. uint8_t buf[10];
  334. int i;
  335. /* U+0001 to U+001F are single byte control characters */
  336. for(i=0x01; i<0x20; i++){
  337. buf[0] = (uint8_t)i;
  338. buf[1] = '\0';
  339. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  340. }
  341. /* U+007F is a single byte control character */
  342. buf[0] = 0x7F;
  343. buf[1] = '\0';
  344. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  345. /* U+007F to U+009F are two byte control characters */
  346. for(i=0x80; i<0xA0; i++){
  347. buf[0] = 0xC2;
  348. buf[1] = (uint8_t)(i-0x80);
  349. buf[2] = '\0';
  350. utf8_helper((char *)buf, MOSQ_ERR_MALFORMED_UTF8);
  351. }
  352. }
  353. void TEST_utf8_mqtt_1_5_4_2(void)
  354. {
  355. uint8_t buf[10] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', '\0'};
  356. utf8_helper_len((char *)buf, 9, MOSQ_ERR_SUCCESS);
  357. buf[3] = '\0';
  358. utf8_helper_len((char *)buf, 9, MOSQ_ERR_MALFORMED_UTF8);
  359. }
  360. void TEST_utf8_mqtt_1_5_4_3(void)
  361. {
  362. uint8_t buf[10] = {'a', 'b', 0xEF, 0xBB, 0xBF, 'f', 'g', 'h', 'i', '\0'};
  363. utf8_helper_len((char *)buf, 9, MOSQ_ERR_SUCCESS);
  364. }
  365. /* ========================================================================
  366. * TEST SUITE SETUP
  367. * ======================================================================== */
  368. int init_utf8_tests(void)
  369. {
  370. CU_pSuite test_suite = NULL;
  371. test_suite = CU_add_suite("UTF-8", NULL, NULL);
  372. if(!test_suite){
  373. printf("Error adding CUnit test suite.\n");
  374. return 1;
  375. }
  376. if(0
  377. || !CU_add_test(test_suite, "UTF-8 empty", TEST_utf8_empty)
  378. || !CU_add_test(test_suite, "UTF-8 valid", TEST_utf8_valid)
  379. || !CU_add_test(test_suite, "UTF-8 truncated", TEST_utf8_truncated)
  380. || !CU_add_test(test_suite, "UTF-8 boundary conditions", TEST_utf8_boundary_conditions)
  381. || !CU_add_test(test_suite, "UTF-8 malformed sequences", TEST_utf8_malformed_sequences)
  382. || !CU_add_test(test_suite, "UTF-8 overlong encoding", TEST_utf8_overlong_encoding)
  383. || !CU_add_test(test_suite, "UTF-8 illegal code positions", TEST_utf8_illegal_code_positions)
  384. || !CU_add_test(test_suite, "UTF-8 control characters", TEST_utf8_control_characters)
  385. || !CU_add_test(test_suite, "UTF-8 MQTT-1.5.4-2", TEST_utf8_mqtt_1_5_4_2)
  386. || !CU_add_test(test_suite, "UTF-8 MQTT-1.5.4-3", TEST_utf8_mqtt_1_5_4_3)
  387. ){
  388. printf("Error adding UTF-8 CUnit tests.\n");
  389. return 1;
  390. }
  391. return 0;
  392. }