minihuf.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
  1. /*
  2. * minilex.c
  3. *
  4. * High efficiency lexical state parser
  5. *
  6. * Copyright (C)2011-2014 Andy Green <andy@warmcat.com>
  7. *
  8. * Licensed under LGPL2
  9. *
  10. * Usage: gcc minihuf.c -o minihuf && ./minihuf > huftable.h
  11. *
  12. * Run it twice to test parsing on the generated table on stderr
  13. */
  14. #include <stdio.h>
  15. #include <stdlib.h>
  16. #include <string.h>
  17. #define ARRAY_SIZE(n) (sizeof(n) / sizeof(n[0]))
  18. struct huf {
  19. unsigned int code;
  20. unsigned char len;
  21. };
  22. static struct huf huf_literal[] = {
  23. /* 0x00 */ { 0x1ff8, 13 },
  24. /* 0x01 */ { 0x7fffd8, 23 },
  25. /* 0x02 */ { 0xfffffe2, 28 },
  26. /* 0x03 */ { 0xfffffe3, 28 },
  27. /* 0x04 */ { 0xfffffe4, 28 },
  28. /* 0x05 */ { 0xfffffe5, 28 },
  29. /* 0x06 */ { 0xfffffe6, 28 },
  30. /* 0x07 */ { 0xfffffe7, 28 },
  31. /* 0x08 */ { 0xfffffe8, 28 },
  32. /* 0x09 */ { 0xffffea, 24 },
  33. /* 0x0a */ { 0x3ffffffc, 30 },
  34. /* 0x0b */ { 0xfffffe9, 28 },
  35. /* 0x0c */ { 0xfffffea, 28 },
  36. /* 0x0d */ { 0x3ffffffd, 30 },
  37. /* 0x0e */ { 0xfffffeb, 28 },
  38. /* 0x0f */ { 0xfffffec, 28 },
  39. /* 0x10 */ { 0xfffffed, 28 },
  40. /* 0x11 */ { 0xfffffee, 28 },
  41. /* 0x12 */ { 0xfffffef, 28 },
  42. /* 0x13 */ { 0xffffff0, 28 },
  43. /* 0x14 */ { 0xffffff1, 28 },
  44. /* 0x15 */ { 0xffffff2, 28 },
  45. /* 0x16 */ { 0x3ffffffe, 30 },
  46. /* 0x17 */ { 0xffffff3, 28 },
  47. /* 0x18 */ { 0xffffff4, 28 },
  48. /* 0x19 */ { 0xffffff5, 28 },
  49. /* 0x1a */ { 0xffffff6, 28 },
  50. /* 0x1b */ { 0xffffff7, 28 },
  51. /* 0x1c */ { 0xffffff8, 28 },
  52. /* 0x1d */ { 0xffffff9, 28 },
  53. /* 0x1e */ { 0xffffffa, 28 },
  54. /* 0x1f */ { 0xffffffb, 28 },
  55. /* 0x20 */ { 0x14, 6 },
  56. /* 0x21 */ { 0x3f8, 10 },
  57. /* 0x22 */ { 0x3f9, 10 },
  58. /* 0x23 */ { 0xffa, 12 },
  59. /* 0x24 */ { 0x1ff9, 13 },
  60. /* 0x25 */ { 0x15, 6 },
  61. /* 0x26 */ { 0xf8, 8 },
  62. /* 0x27 */ { 0x7fa, 11 },
  63. /* 0x28 */ { 0x3fa, 10 },
  64. /* 0x29 */ { 0x3fb, 10 },
  65. /* 0x2a */ { 0xf9, 8 },
  66. /* 0x2b */ { 0x7fb, 11 },
  67. /* 0x2c */ { 0xfa, 8 },
  68. /* 0x2d */ { 0x16, 6 },
  69. /* 0x2e */ { 0x17, 6 },
  70. /* 0x2f */ { 0x18, 6 },
  71. /* 0x30 */ { 0x0, 5 },
  72. /* 0x31 */ { 0x1, 5 },
  73. /* 0x32 */ { 0x2, 5 },
  74. /* 0x33 */ { 0x19, 6 },
  75. /* 0x34 */ { 0x1a, 6 },
  76. /* 0x35 */ { 0x1b, 6 },
  77. /* 0x36 */ { 0x1c, 6 },
  78. /* 0x37 */ { 0x1d, 6 },
  79. /* 0x38 */ { 0x1e, 6 },
  80. /* 0x39 */ { 0x1f, 6 },
  81. /* 0x3a */ { 0x5c, 7 },
  82. /* 0x3b */ { 0xfb, 8 },
  83. /* 0x3c */ { 0x7ffc, 15 },
  84. /* 0x3d */ { 0x20, 6 },
  85. /* 0x3e */ { 0xffb, 12 },
  86. /* 0x3f */ { 0x3fc, 10 },
  87. /* 0x40 */ { 0x1ffa, 13 },
  88. /* 0x41 */ { 0x21, 6 },
  89. /* 0x42 */ { 0x5d, 7 },
  90. /* 0x43 */ { 0x5e, 7 },
  91. /* 0x44 */ { 0x5f, 7 },
  92. /* 0x45 */ { 0x60, 7 },
  93. /* 0x46 */ { 0x61, 7 },
  94. /* 0x47 */ { 0x62, 7 },
  95. /* 0x48 */ { 0x63, 7 },
  96. /* 0x49 */ { 0x64, 7 },
  97. /* 0x4a */ { 0x65, 7 },
  98. /* 0x4b */ { 0x66, 7 },
  99. /* 0x4c */ { 0x67, 7 },
  100. /* 0x4d */ { 0x68, 7 },
  101. /* 0x4e */ { 0x69, 7 },
  102. /* 0x4f */ { 0x6a, 7 },
  103. /* 0x50 */ { 0x6b, 7 },
  104. /* 0x51 */ { 0x6c, 7 },
  105. /* 0x52 */ { 0x6d, 7 },
  106. /* 0x53 */ { 0x6e, 7 },
  107. /* 0x54 */ { 0x6f, 7 },
  108. /* 0x55 */ { 0x70, 7 },
  109. /* 0x56 */ { 0x71, 7 },
  110. /* 0x57 */ { 0x72, 7 },
  111. /* 0x58 */ { 0xfc, 8 },
  112. /* 0x59 */ { 0x73, 7 },
  113. /* 0x5a */ { 0xfd, 8 },
  114. /* 0x5b */ { 0x1ffb, 13 },
  115. /* 0x5c */ { 0x7fff0, 19 },
  116. /* 0x5d */ { 0x1ffc, 13 },
  117. /* 0x5e */ { 0x3ffc, 14 },
  118. /* 0x5f */ { 0x22, 6 },
  119. /* 0x60 */ { 0x7ffd, 15 },
  120. /* 0x61 */ { 0x3, 5 },
  121. /* 0x62 */ { 0x23, 6 },
  122. /* 0x63 */ { 0x4, 5 },
  123. /* 0x64 */ { 0x24, 6 },
  124. /* 0x65 */ { 0x5, 5 },
  125. /* 0x66 */ { 0x25, 6 },
  126. /* 0x67 */ { 0x26, 6 },
  127. /* 0x68 */ { 0x27, 6 },
  128. /* 0x69 */ { 0x6, 5 },
  129. /* 0x6a */ { 0x74, 7 },
  130. /* 0x6b */ { 0x75, 7 },
  131. /* 0x6c */ { 0x28, 6 },
  132. /* 0x6d */ { 0x29, 6 },
  133. /* 0x6e */ { 0x2a, 6 },
  134. /* 0x6f */ { 0x7, 5 },
  135. /* 0x70 */ { 0x2b, 6 },
  136. /* 0x71 */ { 0x76, 7 },
  137. /* 0x72 */ { 0x2c, 6 },
  138. /* 0x73 */ { 0x8, 5 },
  139. /* 0x74 */ { 0x9, 5 },
  140. /* 0x75 */ { 0x2d, 6 },
  141. /* 0x76 */ { 0x77, 7 },
  142. /* 0x77 */ { 0x78, 7 },
  143. /* 0x78 */ { 0x79, 7 },
  144. /* 0x79 */ { 0x7a, 7 },
  145. /* 0x7a */ { 0x7b, 7 },
  146. /* 0x7b */ { 0x7ffe, 15 },
  147. /* 0x7c */ { 0x7fc, 11 },
  148. /* 0x7d */ { 0x3ffd, 14 },
  149. /* 0x7e */ { 0x1ffd, 13 },
  150. /* 0x7f */ { 0xffffffc, 28 },
  151. /* 0x80 */ { 0xfffe6, 20 },
  152. /* 0x81 */ { 0x3fffd2, 22 },
  153. /* 0x82 */ { 0xfffe7, 20 },
  154. /* 0x83 */ { 0xfffe8, 20 },
  155. /* 0x84 */ { 0x3fffd3, 22 },
  156. /* 0x85 */ { 0x3fffd4, 22 },
  157. /* 0x86 */ { 0x3fffd5, 22 },
  158. /* 0x87 */ { 0x7fffd9, 23 },
  159. /* 0x88 */ { 0x3fffd6, 22 },
  160. /* 0x89 */ { 0x7fffda, 23 },
  161. /* 0x8a */ { 0x7fffdb, 23 },
  162. /* 0x8b */ { 0x7fffdc, 23 },
  163. /* 0x8c */ { 0x7fffdd, 23 },
  164. /* 0x8d */ { 0x7fffde, 23 },
  165. /* 0x8e */ { 0xffffeb, 24 },
  166. /* 0x8f */ { 0x7fffdf, 23 },
  167. /* 0x90 */ { 0xffffec, 24 },
  168. /* 0x91 */ { 0xffffed, 24 },
  169. /* 0x92 */ { 0x3fffd7, 22 },
  170. /* 0x93 */ { 0x7fffe0, 23 },
  171. /* 0x94 */ { 0xffffee, 24 },
  172. /* 0x95 */ { 0x7fffe1, 23 },
  173. /* 0x96 */ { 0x7fffe2, 23 },
  174. /* 0x97 */ { 0x7fffe3, 23 },
  175. /* 0x98 */ { 0x7fffe4, 23 },
  176. /* 0x99 */ { 0x1fffdc, 21 },
  177. /* 0x9a */ { 0x3fffd8, 22 },
  178. /* 0x9b */ { 0x7fffe5, 23 },
  179. /* 0x9c */ { 0x3fffd9, 22 },
  180. /* 0x9d */ { 0x7fffe6, 23 },
  181. /* 0x9e */ { 0x7fffe7, 23 },
  182. /* 0x9f */ { 0xffffef, 24 },
  183. /* 0xa0 */ { 0x3fffda, 22 },
  184. /* 0xa1 */ { 0x1fffdd, 21 },
  185. /* 0xa2 */ { 0xfffe9, 20 },
  186. /* 0xa3 */ { 0x3fffdb, 22 },
  187. /* 0xa4 */ { 0x3fffdc, 22 },
  188. /* 0xa5 */ { 0x7fffe8, 23 },
  189. /* 0xa6 */ { 0x7fffe9, 23 },
  190. /* 0xa7 */ { 0x1fffde, 21 },
  191. /* 0xa8 */ { 0x7fffea, 23 },
  192. /* 0xa9 */ { 0x3fffdd, 22 },
  193. /* 0xaa */ { 0x3fffde, 22 },
  194. /* 0xab */ { 0xfffff0, 24 },
  195. /* 0xac */ { 0x1fffdf, 21 },
  196. /* 0xad */ { 0x3fffdf, 22 },
  197. /* 0xae */ { 0x7fffeb, 23 },
  198. /* 0xaf */ { 0x7fffec, 23 },
  199. /* 0xb0 */ { 0x1fffe0, 21 },
  200. /* 0xb1 */ { 0x1fffe1, 21 },
  201. /* 0xb2 */ { 0x3fffe0, 22 },
  202. /* 0xb3 */ { 0x1fffe2, 21 },
  203. /* 0xb4 */ { 0x7fffed, 23 },
  204. /* 0xb5 */ { 0x3fffe1, 22 },
  205. /* 0xb6 */ { 0x7fffee, 23 },
  206. /* 0xb7 */ { 0x7fffef, 23 },
  207. /* 0xb8 */ { 0xfffea, 20 },
  208. /* 0xb9 */ { 0x3fffe2, 22 },
  209. /* 0xba */ { 0x3fffe3, 22 },
  210. /* 0xbb */ { 0x3fffe4, 22 },
  211. /* 0xbc */ { 0x7ffff0, 23 },
  212. /* 0xbd */ { 0x3fffe5, 22 },
  213. /* 0xbe */ { 0x3fffe6, 22 },
  214. /* 0xbf */ { 0x7ffff1, 23 },
  215. /* 0xc0 */ { 0x3ffffe0, 26 },
  216. /* 0xc1 */ { 0x3ffffe1, 26 },
  217. /* 0xc2 */ { 0xfffeb, 20 },
  218. /* 0xc3 */ { 0x7fff1, 19 },
  219. /* 0xc4 */ { 0x3fffe7, 22 },
  220. /* 0xc5 */ { 0x7ffff2, 23 },
  221. /* 0xc6 */ { 0x3fffe8, 22 },
  222. /* 0xc7 */ { 0x1ffffec, 25 },
  223. /* 0xc8 */ { 0x3ffffe2, 26 },
  224. /* 0xc9 */ { 0x3ffffe3, 26 },
  225. /* 0xca */ { 0x3ffffe4, 26 },
  226. /* 0xcb */ { 0x7ffffde, 27 },
  227. /* 0xcc */ { 0x7ffffdf, 27 },
  228. /* 0xcd */ { 0x3ffffe5, 26 },
  229. /* 0xce */ { 0xfffff1, 24 },
  230. /* 0xcf */ { 0x1ffffed, 25 },
  231. /* 0xd0 */ { 0x7fff2, 19 },
  232. /* 0xd1 */ { 0x1fffe3, 21 },
  233. /* 0xd2 */ { 0x3ffffe6, 26 },
  234. /* 0xd3 */ { 0x7ffffe0, 27 },
  235. /* 0xd4 */ { 0x7ffffe1, 27 },
  236. /* 0xd5 */ { 0x3ffffe7, 26 },
  237. /* 0xd6 */ { 0x7ffffe2, 27 },
  238. /* 0xd7 */ { 0xfffff2, 24 },
  239. /* 0xd8 */ { 0x1fffe4, 21 },
  240. /* 0xd9 */ { 0x1fffe5, 21 },
  241. /* 0xda */ { 0x3ffffe8, 26 },
  242. /* 0xdb */ { 0x3ffffe9, 26 },
  243. /* 0xdc */ { 0xffffffd, 28 },
  244. /* 0xdd */ { 0x7ffffe3, 27 },
  245. /* 0xde */ { 0x7ffffe4, 27 },
  246. /* 0xdf */ { 0x7ffffe5, 27 },
  247. /* 0xe0 */ { 0xfffec, 20 },
  248. /* 0xe1 */ { 0xfffff3, 24 },
  249. /* 0xe2 */ { 0xfffed, 20 },
  250. /* 0xe3 */ { 0x1fffe6, 21 },
  251. /* 0xe4 */ { 0x3fffe9, 22 },
  252. /* 0xe5 */ { 0x1fffe7, 21 },
  253. /* 0xe6 */ { 0x1fffe8, 21 },
  254. /* 0xe7 */ { 0x7ffff3, 23 },
  255. /* 0xe8 */ { 0x3fffea, 22 },
  256. /* 0xe9 */ { 0x3fffeb, 22 },
  257. /* 0xea */ { 0x1ffffee, 25 },
  258. /* 0xeb */ { 0x1ffffef, 25 },
  259. /* 0xec */ { 0xfffff4, 24 },
  260. /* 0xed */ { 0xfffff5, 24 },
  261. /* 0xee */ { 0x3ffffea, 26 },
  262. /* 0xef */ { 0x7ffff4, 23 },
  263. /* 0xf0 */ { 0x3ffffeb, 26 },
  264. /* 0xf1 */ { 0x7ffffe6, 27 },
  265. /* 0xf2 */ { 0x3ffffec, 26 },
  266. /* 0xf3 */ { 0x3ffffed, 26 },
  267. /* 0xf4 */ { 0x7ffffe7, 27 },
  268. /* 0xf5 */ { 0x7ffffe8, 27 },
  269. /* 0xf6 */ { 0x7ffffe9, 27 },
  270. /* 0xf7 */ { 0x7ffffea, 27 },
  271. /* 0xf8 */ { 0x7ffffeb, 27 },
  272. /* 0xf9 */ { 0xffffffe, 28 },
  273. /* 0xfa */ { 0x7ffffec, 27 },
  274. /* 0xfb */ { 0x7ffffed, 27 },
  275. /* 0xfc */ { 0x7ffffee, 27 },
  276. /* 0xfd */ { 0x7ffffef, 27 },
  277. /* 0xfe */ { 0x7fffff0, 27 },
  278. /* 0xff */ { 0x3ffffee, 26 },
  279. /* 0x100 */ { 0x3fffffff, 30 },
  280. };
  281. int code_bit(int idx, int bit)
  282. {
  283. if (bit < huf_literal[idx].len)
  284. return !!(huf_literal[idx].code & (1 << (huf_literal[idx].len - 1 - bit)));
  285. return -1;
  286. }
  287. #include "huftable.h"
  288. #define PARALLEL 2
  289. struct state {
  290. int terminal;
  291. int state[PARALLEL];
  292. int bytepos;
  293. int real_pos;
  294. };
  295. struct state state[2000];
  296. unsigned char terms[2000];
  297. int next = 1;
  298. int lextable_decode(int pos, char c)
  299. {
  300. int q = pos + !!c;
  301. if (lextable_terms[q >> 3] & (1 << (q & 7))) /* terminal */
  302. return lextable[q] | 0x8000;
  303. return pos + (lextable[q] << 1);
  304. }
  305. int main(void)
  306. {
  307. int n = 0;
  308. int m = 0;
  309. int prev;
  310. char c;
  311. int walk;
  312. int saw;
  313. int y;
  314. int j;
  315. int q;
  316. int pos = 0;
  317. int biggest = 0;
  318. int fails = 0;
  319. m = 0;
  320. while (m < ARRAY_SIZE(state)) {
  321. for (j = 0; j < PARALLEL; j++) {
  322. state[m].state[j] = 0xffff;
  323. state[m].terminal = 0;
  324. }
  325. m++;
  326. }
  327. while (n < ARRAY_SIZE(huf_literal)) {
  328. m = 0;
  329. walk = 0;
  330. prev = 0;
  331. while (m < huf_literal[n].len) {
  332. saw = 0;
  333. if (state[walk].state[code_bit(n, m)] != 0xffff) {
  334. /* exists -- go forward */
  335. walk = state[walk].state[code_bit(n, m)];
  336. goto again;
  337. }
  338. /* something we didn't see before */
  339. state[walk].state[code_bit(n, m)] = next;
  340. walk = next++;
  341. again:
  342. m++;
  343. }
  344. state[walk].terminal = n++;
  345. state[walk].state[0] = 0; /* terminal marker */
  346. }
  347. walk = 0;
  348. for (n = 0; n < next; n++) {
  349. state[n].bytepos = walk;
  350. walk += (2 * 2);
  351. }
  352. /* compute everyone's position first */
  353. pos = 0;
  354. walk = 0;
  355. for (n = 0; n < next; n++) {
  356. state[n].real_pos = pos;
  357. if (state[n].state[0]) /* nonterminal */
  358. pos += 2;
  359. walk ++;
  360. }
  361. fprintf(stdout, "static unsigned char lextable[] = {\n");
  362. #define TERMINAL_MASK 0x8000
  363. walk = 0;
  364. pos = 0;
  365. q = 0;
  366. for (n = 0; n < next; n++) {
  367. q = pos;
  368. for (m = 0; m < 2; m++) {
  369. saw = state[n].state[m];
  370. if (saw == 0) { // c is a terminal then
  371. m = 2;
  372. continue;
  373. }
  374. if (!m)
  375. fprintf(stdout, "/* pos %04x: %3d */ ",
  376. state[n].real_pos, n);
  377. else
  378. fprintf(stdout, " ");
  379. if (saw == 0xffff) {
  380. fprintf(stdout,
  381. " 0xff, 0xff, /* 0 = fail */\n ");
  382. pos ++; /* fail */
  383. fails++;
  384. continue;
  385. }
  386. if (state[saw].state[0] == 0) { /* points to terminal */
  387. fprintf(stdout, " /* terminal %d */ 0x%02X,\n",
  388. state[saw].terminal,
  389. state[saw].terminal & 0xff);
  390. terms[(state[n].real_pos + m) >> 3] |=
  391. 1 << ((state[n].real_pos + m) & 7);
  392. pos++;
  393. walk++;
  394. continue;
  395. }
  396. j = (state[saw].real_pos - q) >> 1;
  397. if (j > biggest)
  398. biggest = j;
  399. if (j > 0xffff) {
  400. fprintf(stderr,
  401. "Jump > 64K bytes ahead (%d to %d)\n",
  402. state[n].real_pos, state[saw].real_pos);
  403. return 1;
  404. }
  405. fprintf(stdout, " /* %d */ 0x%02X "
  406. "/* (to 0x%04X state %3d) */,\n",
  407. m,
  408. j & 0xff,
  409. state[saw].real_pos, saw);
  410. pos++;
  411. walk++;
  412. }
  413. }
  414. fprintf(stdout, "/* total size %d bytes, biggest jump %d/256, fails=%d */\n};\n"
  415. "\n static unsigned char lextable_terms[] = {\n",
  416. pos, biggest, fails);
  417. for (n = 0; n < (walk + 7) / 8; n++) {
  418. if (!(n & 7))
  419. fprintf(stdout, "\n\t");
  420. fprintf(stdout, "0x%02x, ", terms[n]);
  421. }
  422. fprintf(stdout, "\n};\n");
  423. /*
  424. * Try to parse every legal input string
  425. */
  426. for (n = 0; n < ARRAY_SIZE(huf_literal); n++) {
  427. walk = 0;
  428. m = 0;
  429. y = -1;
  430. fprintf(stderr, " trying %d\n", n);
  431. while (m < huf_literal[n].len) {
  432. prev = walk;
  433. walk = lextable_decode(walk, code_bit(n, m));
  434. if (walk == 0xffff) {
  435. fprintf(stderr, "failed\n");
  436. return 3;
  437. }
  438. if (walk & 0x8000) {
  439. y = walk & 0x7fff;
  440. if (y == 0 && m == 29) {
  441. y |= 0x100;
  442. fprintf(stdout,
  443. "\n/* state that points to "
  444. "0x100 for disambiguation with "
  445. "0x0 */\n"
  446. "#define HUFTABLE_0x100_PREV "
  447. "%d\n", prev);
  448. }
  449. break;
  450. }
  451. m++;
  452. }
  453. if (y != n) {
  454. fprintf(stderr, "decode failed %d got %d (0x%x)\n", n, y, y);
  455. return 4;
  456. }
  457. }
  458. fprintf(stderr, "All decode OK\n");
  459. return 0;
  460. }