minilex.c 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. /*
  2. * minilex.c
  3. *
  4. * High efficiency lexical state parser
  5. *
  6. * Copyright (C)2011-2014 Andy Green <andy@warmcat.com>
  7. *
  8. * Licensed under LGPL2
  9. *
  10. * Usage: gcc minilex.c -o minilex && ./minilex > lextable.h
  11. *
  12. * Run it twice to test parsing on the generated table on stderr
  13. */
  14. #include <stdio.h>
  15. #include <stdlib.h>
  16. #include <string.h>
  17. #include "lextable-strings.h"
  18. /*
  19. * b7 = 0 = 1-byte seq
  20. * 0x08 = fail
  21. * 2-byte seq
  22. * 0x00 - 0x07, then terminal as given in 2nd byte
  23. 3-byte seq
  24. * no match: go fwd 3 byte, match: jump fwd by amt in +1/+2 bytes
  25. * = 1 = 1-byte seq
  26. * no match: die, match go fwd 1 byte
  27. */
  28. unsigned char lextable[] = {
  29. #include "lextable.h"
  30. };
  31. #define PARALLEL 30
  32. struct state {
  33. char c[PARALLEL];
  34. int state[PARALLEL];
  35. int count;
  36. int bytepos;
  37. int real_pos;
  38. };
  39. struct state state[1000];
  40. int next = 1;
  41. #define FAIL_CHAR 0x08
  42. int lextable_decode(int pos, char c)
  43. {
  44. while (1) {
  45. if (lextable[pos] & (1 << 7)) { /* 1-byte, fail on mismatch */
  46. if ((lextable[pos] & 0x7f) != c)
  47. return -1;
  48. /* fall thru */
  49. pos++;
  50. if (lextable[pos] == FAIL_CHAR)
  51. return -1;
  52. return pos;
  53. } else { /* b7 = 0, end or 3-byte */
  54. if (lextable[pos] < FAIL_CHAR) /* terminal marker */
  55. return pos;
  56. if (lextable[pos] == c) /* goto */
  57. return pos + (lextable[pos + 1]) +
  58. (lextable[pos + 2] << 8);
  59. /* fall thru goto */
  60. pos += 3;
  61. /* continue */
  62. }
  63. }
  64. }
  65. int main(void)
  66. {
  67. int n = 0;
  68. int m = 0;
  69. int prev;
  70. char c;
  71. int walk;
  72. int saw;
  73. int y;
  74. int j;
  75. int pos = 0;
  76. while (n < sizeof(set) / sizeof(set[0])) {
  77. m = 0;
  78. walk = 0;
  79. prev = 0;
  80. if (set[n][0] == '\0') {
  81. n++;
  82. continue;
  83. }
  84. while (set[n][m]) {
  85. saw = 0;
  86. for (y = 0; y < state[walk].count; y++)
  87. if (state[walk].c[y] == set[n][m]) {
  88. /* exists -- go forward */
  89. walk = state[walk].state[y];
  90. saw = 1;
  91. break;
  92. }
  93. if (saw)
  94. goto again;
  95. /* something we didn't see before */
  96. state[walk].c[state[walk].count] = set[n][m];
  97. state[walk].state[state[walk].count] = next;
  98. state[walk].count++;
  99. walk = next++;
  100. again:
  101. m++;
  102. }
  103. state[walk].c[0] = n++;
  104. state[walk].state[0] = 0; /* terminal marker */
  105. state[walk].count = 1;
  106. }
  107. walk = 0;
  108. for (n = 0; n < next; n++) {
  109. state[n].bytepos = walk;
  110. walk += (2 * state[n].count);
  111. }
  112. /* compute everyone's position first */
  113. pos = 0;
  114. walk = 0;
  115. for (n = 0; n < next; n++) {
  116. state[n].real_pos = pos;
  117. for (m = 0; m < state[n].count; m++) {
  118. if (state[n].state[m] == 0)
  119. pos += 2; /* terminal marker */
  120. else { /* c is a character */
  121. if ((state[state[n].state[m]].bytepos -
  122. walk) == 2)
  123. pos++;
  124. else {
  125. pos += 3;
  126. if (m == state[n].count - 1)
  127. pos++; /* fail */
  128. }
  129. }
  130. walk += 2;
  131. }
  132. }
  133. walk = 0;
  134. pos = 0;
  135. for (n = 0; n < next; n++) {
  136. for (m = 0; m < state[n].count; m++) {
  137. if (!m)
  138. fprintf(stdout, "/* pos %04x: %3d */ ",
  139. state[n].real_pos, n);
  140. else
  141. fprintf(stdout, " ");
  142. y = state[n].c[m];
  143. saw = state[n].state[m];
  144. if (saw == 0) { // c is a terminal then
  145. if (y > 0x7ff) {
  146. fprintf(stderr, "terminal too big\n");
  147. return 2;
  148. }
  149. fprintf(stdout, " 0x%02X, 0x%02X "
  150. " "
  151. "/* - terminal marker %2d - */,\n",
  152. y >> 8, y & 0xff, y & 0x7f);
  153. pos += 2;
  154. walk += 2;
  155. continue;
  156. }
  157. /* c is a character */
  158. prev = y &0x7f;
  159. if (prev < 32 || prev > 126)
  160. prev = '.';
  161. if ((state[saw].bytepos - walk) == 2) {
  162. fprintf(stdout, " 0x%02X /* '%c' -> */,\n",
  163. y | 0x80, prev);
  164. pos++;
  165. walk += 2;
  166. continue;
  167. }
  168. j = state[saw].real_pos - pos;
  169. if (j > 0xffff) {
  170. fprintf(stderr,
  171. "Jump > 64K bytes ahead (%d to %d)\n",
  172. state[n].real_pos, state[saw].real_pos);
  173. return 1;
  174. }
  175. fprintf(stdout, " 0x%02X /* '%c' */, 0x%02X, 0x%02X "
  176. "/* (to 0x%04X state %3d) */,\n",
  177. y, prev,
  178. j & 0xff, j >> 8,
  179. state[saw].real_pos, saw);
  180. pos += 3;
  181. if (m == state[n].count - 1) {
  182. fprintf(stdout,
  183. " 0x%02X, /* fail */\n",
  184. FAIL_CHAR);
  185. pos++; /* fail */
  186. }
  187. walk += 2;
  188. }
  189. }
  190. fprintf(stdout, "/* total size %d bytes */\n", pos);
  191. /*
  192. * Try to parse every legal input string
  193. */
  194. for (n = 0; n < sizeof(set) / sizeof(set[0]); n++) {
  195. walk = 0;
  196. m = 0;
  197. y = -1;
  198. if (set[n][0] == '\0')
  199. continue;
  200. fprintf(stderr, " trying '%s'\n", set[n]);
  201. while (set[n][m]) {
  202. walk = lextable_decode(walk, set[n][m]);
  203. if (walk < 0) {
  204. fprintf(stderr, "failed\n");
  205. return 3;
  206. }
  207. if (lextable[walk] < FAIL_CHAR) {
  208. y = (lextable[walk] << 8) + lextable[walk + 1];
  209. break;
  210. }
  211. m++;
  212. }
  213. if (y != n) {
  214. fprintf(stderr, "decode failed %d\n", y);
  215. return 4;
  216. }
  217. }
  218. fprintf(stderr, "All decode OK\n");
  219. return 0;
  220. }