12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724 |
- #ifdef SLRE_TEST
- #include <stdio.h>
- #include <assert.h>
- #include <ctype.h>
- #include <stdlib.h>
- #include <string.h>
- #else
- #include <common.h>
- #include <linux/ctype.h>
- #endif
- #include <errno.h>
- #include <slre.h>
/*
 * Opcodes of the compiled byte code.  The numeric value of each opcode is
 * also its index into the opcodes[] table below, so both lists must stay
 * in the same order.
 */
enum {
	END,		/* terminates a (sub)program */
	BRANCH,		/* alternation, followed by two jump offsets */
	ANY,		/* any single character */
	EXACT,		/* literal text: data offset, length */
	ANYOF,		/* character set: data offset, length */
	ANYBUT,		/* negated character set: data offset, length */
	OPEN,		/* start of a capture, followed by its number */
	CLOSE,		/* end of a capture, followed by its number */
	BOL,		/* matches at offset 0 only */
	EOL,		/* matches at the end of the subject only */
	STAR,		/* greedy repetition, followed by a jump offset */
	PLUS,		/* one match, then greedy repetition */
	STARQ,		/* non-greedy repetition */
	PLUSQ,		/* one match, then non-greedy repetition */
	QUEST,		/* optional operand */
	SPACE,		/* one isspace() character */
	NONSPACE,	/* one character that is not isspace() */
	DIGIT		/* one isdigit() character */
};

#ifdef SLRE_TEST
/*
 * Printable description of every opcode, used by the dump routines.
 * Each letter of "flags" tells slre_dump() how to print the next argument:
 *   'i' - plain integer (one byte)
 *   'o' - jump offset, printed as an absolute position (one byte)
 *   'D' - character set (two bytes: data offset, length)
 *   'd' - literal text (two bytes: data offset, length)
 */
static struct {
	const char *name;	/* text printed for the opcode */
	int narg;		/* argument count (not read by the code in this file section) */
	const char *flags;	/* argument formats, see above */
} opcodes[] = {
	[END]		= {"END", 0, ""},
	[BRANCH]	= {"BRANCH", 2, "oo"},
	[ANY]		= {"ANY", 0, ""},
	[EXACT]		= {"EXACT", 2, "d"},
	[ANYOF]		= {"ANYOF", 2, "D"},
	[ANYBUT]	= {"ANYBUT", 2, "D"},
	[OPEN]		= {"OPEN ", 1, "i"},
	[CLOSE]		= {"CLOSE", 1, "i"},
	[BOL]		= {"BOL", 0, ""},
	[EOL]		= {"EOL", 0, ""},
	[STAR]		= {"STAR", 1, "o"},
	[PLUS]		= {"PLUS", 1, "o"},
	[STARQ]		= {"STARQ", 1, "o"},
	[PLUSQ]		= {"PLUSQ", 1, "o"},
	[QUEST]		= {"QUEST", 1, "o"},
	[SPACE]		= {"SPACE", 0, ""},
	[NONSPACE]	= {"NONSPACE", 0, ""},
	[DIGIT]		= {"DIGIT", 0, ""}
};
#endif

/* Characters that end a run of literal text while compiling (see exact()). */
static const char *meta_chars = "|.^$*+?()[\\";
- #ifdef SLRE_TEST
- static void
- print_character_set(FILE *fp, const unsigned char *p, int len)
- {
- int i;
- for (i = 0; i < len; i++) {
- if (i > 0)
- (void) fputc(',', fp);
- if (p[i] == 0) {
- i++;
- if (p[i] == 0)
- (void) fprintf(fp, "\\x%02x", p[i]);
- else
- (void) fprintf(fp, "%s", opcodes[p[i]].name);
- } else if (isprint(p[i])) {
- (void) fputc(p[i], fp);
- } else {
- (void) fprintf(fp, "\\x%02x", p[i]);
- }
- }
- }
- void
- slre_dump(const struct slre *r, FILE *fp)
- {
- int i, j, ch, op, pc;
- for (pc = 0; pc < r->code_size; pc++) {
- op = r->code[pc];
- (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
- for (i = 0; opcodes[op].flags[i] != '\0'; i++)
- switch (opcodes[op].flags[i]) {
- case 'i':
- (void) fprintf(fp, "%d ", r->code[pc + 1]);
- pc++;
- break;
- case 'o':
- (void) fprintf(fp, "%d ",
- pc + r->code[pc + 1] - i);
- pc++;
- break;
- case 'D':
- print_character_set(fp, r->data +
- r->code[pc + 1], r->code[pc + 2]);
- pc += 2;
- break;
- case 'd':
- (void) fputc('"', fp);
- for (j = 0; j < r->code[pc + 2]; j++) {
- ch = r->data[r->code[pc + 1] + j];
- if (isprint(ch)) {
- (void) fputc(ch, fp);
- } else {
- (void) fprintf(fp,
- "\\x%02x", ch);
- }
- }
- (void) fputc('"', fp);
- pc += 2;
- break;
- }
- (void) fputc('\n', fp);
- }
- }
- #endif
- static void
- set_jump_offset(struct slre *r, int pc, int offset)
- {
- assert(offset < r->code_size);
- if (r->code_size - offset > 0xff)
- r->err_str = "Jump offset is too big";
- else
- r->code[pc] = (unsigned char) (r->code_size - offset);
- }
- static void
- emit(struct slre *r, int code)
- {
- if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
- r->err_str = "RE is too long (code overflow)";
- else
- r->code[r->code_size++] = (unsigned char) code;
- }
- static void
- store_char_in_data(struct slre *r, int ch)
- {
- if (r->data_size >= (int) sizeof(r->data))
- r->err_str = "RE is too long (data overflow)";
- else
- r->data[r->data_size++] = ch;
- }
- static void
- exact(struct slre *r, const char **re)
- {
- int old_data_size = r->data_size;
- while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
- store_char_in_data(r, *(*re)++);
- emit(r, EXACT);
- emit(r, old_data_size);
- emit(r, r->data_size - old_data_size);
- }
- static int
- get_escape_char(const char **re)
- {
- int res;
- switch (*(*re)++) {
- case 'n':
- res = '\n';
- break;
- case 'r':
- res = '\r';
- break;
- case 't':
- res = '\t';
- break;
- case '0':
- res = 0;
- break;
- case 'S':
- res = NONSPACE << 8;
- break;
- case 's':
- res = SPACE << 8;
- break;
- case 'd':
- res = DIGIT << 8;
- break;
- default:
- res = (*re)[-1];
- break;
- }
- return res;
- }
- static void
- anyof(struct slre *r, const char **re)
- {
- int esc, old_data_size = r->data_size, op = ANYOF;
- if (**re == '^') {
- op = ANYBUT;
- (*re)++;
- }
- while (**re != '\0')
- switch (*(*re)++) {
- case ']':
- emit(r, op);
- emit(r, old_data_size);
- emit(r, r->data_size - old_data_size);
- return;
-
- break;
- case '\\':
- esc = get_escape_char(re);
- if ((esc & 0xff) == 0) {
- store_char_in_data(r, 0);
- store_char_in_data(r, esc >> 8);
- } else {
- store_char_in_data(r, esc);
- }
- break;
- default:
- store_char_in_data(r, (*re)[-1]);
- break;
- }
- r->err_str = "No closing ']' bracket";
- }
- static void
- relocate(struct slre *r, int begin, int shift)
- {
- emit(r, END);
- memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
- r->code_size += shift;
- }
- static void
- quantifier(struct slre *r, int prev, int op)
- {
- if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
- r->code[prev + 2]--;
- emit(r, EXACT);
- emit(r, r->code[prev + 1] + r->code[prev + 2]);
- emit(r, 1);
- prev = r->code_size - 3;
- }
- relocate(r, prev, 2);
- r->code[prev] = op;
- set_jump_offset(r, prev + 1, prev);
- }
- static void
- exact_one_char(struct slre *r, int ch)
- {
- emit(r, EXACT);
- emit(r, r->data_size);
- emit(r, 1);
- store_char_in_data(r, ch);
- }
- static void
- fixup_branch(struct slre *r, int fixup)
- {
- if (fixup > 0) {
- emit(r, END);
- set_jump_offset(r, fixup, fixup - 2);
- }
- }
- static void
- compile(struct slre *r, const char **re)
- {
- int op, esc, branch_start, last_op, fixup, cap_no, level;
- fixup = 0;
- level = r->num_caps;
- branch_start = last_op = r->code_size;
- for (;;)
- switch (*(*re)++) {
- case '\0':
- (*re)--;
- return;
-
- break;
- case '^':
- emit(r, BOL);
- break;
- case '$':
- emit(r, EOL);
- break;
- case '.':
- last_op = r->code_size;
- emit(r, ANY);
- break;
- case '[':
- last_op = r->code_size;
- anyof(r, re);
- break;
- case '\\':
- last_op = r->code_size;
- esc = get_escape_char(re);
- if (esc & 0xff00)
- emit(r, esc >> 8);
- else
- exact_one_char(r, esc);
- break;
- case '(':
- last_op = r->code_size;
- cap_no = ++r->num_caps;
- emit(r, OPEN);
- emit(r, cap_no);
- compile(r, re);
- if (*(*re)++ != ')') {
- r->err_str = "No closing bracket";
- return;
- }
- emit(r, CLOSE);
- emit(r, cap_no);
- break;
- case ')':
- (*re)--;
- fixup_branch(r, fixup);
- if (level == 0) {
- r->err_str = "Unbalanced brackets";
- return;
- }
- return;
-
- break;
- case '+':
- case '*':
- op = (*re)[-1] == '*' ? STAR : PLUS;
- if (**re == '?') {
- (*re)++;
- op = op == STAR ? STARQ : PLUSQ;
- }
- quantifier(r, last_op, op);
- break;
- case '?':
- quantifier(r, last_op, QUEST);
- break;
- case '|':
- fixup_branch(r, fixup);
- relocate(r, branch_start, 3);
- r->code[branch_start] = BRANCH;
- set_jump_offset(r, branch_start + 1, branch_start);
- fixup = branch_start + 2;
- r->code[fixup] = 0xff;
- break;
- default:
- (*re)--;
- last_op = r->code_size;
- exact(r, re);
- break;
- }
- }
- int
- slre_compile(struct slre *r, const char *re)
- {
- r->err_str = NULL;
- r->code_size = r->data_size = r->num_caps = r->anchored = 0;
- if (*re == '^')
- r->anchored++;
- emit(r, OPEN);
- emit(r, 0);
- while (*re != '\0')
- compile(r, &re);
- if (r->code[2] == BRANCH)
- fixup_branch(r, 4);
- emit(r, CLOSE);
- emit(r, 0);
- emit(r, END);
- return (r->err_str == NULL ? 1 : 0);
- }
- static int match(const struct slre *, int,
- const char *, int, int *, struct cap *);
- static void
- loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
- {
- int saved_offset, matched_offset;
- saved_offset = matched_offset = *ofs;
- while (match(r, pc + 2, s, len, ofs, NULL)) {
- saved_offset = *ofs;
- if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
- matched_offset = saved_offset;
- *ofs = saved_offset;
- }
- *ofs = matched_offset;
- }
- static void
- loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
- {
- int saved_offset = *ofs;
- while (match(r, pc + 2, s, len, ofs, NULL)) {
- saved_offset = *ofs;
- if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
- break;
- }
- *ofs = saved_offset;
- }
- static int
- is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
- {
- int i, ch;
- ch = s[*ofs];
- for (i = 0; i < len; i++)
- if (p[i] == ch) {
- (*ofs)++;
- return 1;
- }
- return 0;
- }
- static int
- is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
- {
- int i, ch;
- ch = s[*ofs];
- for (i = 0; i < len; i++) {
- if (p[i] == ch)
- return 0;
- }
- (*ofs)++;
- return 1;
- }
- static int
- match(const struct slre *r, int pc, const char *s, int len,
- int *ofs, struct cap *caps)
- {
- int n, saved_offset, res = 1;
- while (res && r->code[pc] != END) {
- assert(pc < r->code_size);
- assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
- switch (r->code[pc]) {
- case BRANCH:
- saved_offset = *ofs;
- res = match(r, pc + 3, s, len, ofs, caps);
- if (res == 0) {
- *ofs = saved_offset;
- res = match(r, pc + r->code[pc + 1],
- s, len, ofs, caps);
- }
- pc += r->code[pc + 2];
- break;
- case EXACT:
- res = 0;
- n = r->code[pc + 2];
- if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
- r->code[pc + 1], n)) {
- (*ofs) += n;
- res = 1;
- }
- pc += 3;
- break;
- case QUEST:
- res = 1;
- saved_offset = *ofs;
- if (!match(r, pc + 2, s, len, ofs, caps))
- *ofs = saved_offset;
- pc += r->code[pc + 1];
- break;
- case STAR:
- res = 1;
- loop_greedy(r, pc, s, len, ofs);
- pc += r->code[pc + 1];
- break;
- case STARQ:
- res = 1;
- loop_non_greedy(r, pc, s, len, ofs);
- pc += r->code[pc + 1];
- break;
- case PLUS:
- res = match(r, pc + 2, s, len, ofs, caps);
- if (res == 0)
- break;
- loop_greedy(r, pc, s, len, ofs);
- pc += r->code[pc + 1];
- break;
- case PLUSQ:
- res = match(r, pc + 2, s, len, ofs, caps);
- if (res == 0)
- break;
- loop_non_greedy(r, pc, s, len, ofs);
- pc += r->code[pc + 1];
- break;
- case SPACE:
- res = 0;
- if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
- (*ofs)++;
- res = 1;
- }
- pc++;
- break;
- case NONSPACE:
- res = 0;
- if (*ofs < len &&
- !isspace(((unsigned char *)s)[*ofs])) {
- (*ofs)++;
- res = 1;
- }
- pc++;
- break;
- case DIGIT:
- res = 0;
- if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
- (*ofs)++;
- res = 1;
- }
- pc++;
- break;
- case ANY:
- res = 0;
- if (*ofs < len) {
- (*ofs)++;
- res = 1;
- }
- pc++;
- break;
- case ANYOF:
- res = 0;
- if (*ofs < len)
- res = is_any_of(r->data + r->code[pc + 1],
- r->code[pc + 2], s, ofs);
- pc += 3;
- break;
- case ANYBUT:
- res = 0;
- if (*ofs < len)
- res = is_any_but(r->data + r->code[pc + 1],
- r->code[pc + 2], s, ofs);
- pc += 3;
- break;
- case BOL:
- res = *ofs == 0 ? 1 : 0;
- pc++;
- break;
- case EOL:
- res = *ofs == len ? 1 : 0;
- pc++;
- break;
- case OPEN:
- if (caps != NULL)
- caps[r->code[pc + 1]].ptr = s + *ofs;
- pc += 2;
- break;
- case CLOSE:
- if (caps != NULL)
- caps[r->code[pc + 1]].len = (s + *ofs) -
- caps[r->code[pc + 1]].ptr;
- pc += 2;
- break;
- case END:
- pc++;
- break;
- default:
- printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
- assert(0);
- break;
- }
- }
- return res;
- }
- int
- slre_match(const struct slre *r, const char *buf, int len,
- struct cap *caps)
- {
- int i, ofs = 0, res = 0;
- if (r->anchored) {
- res = match(r, 0, buf, len, &ofs, caps);
- } else {
- for (i = 0; i < len && res == 0; i++) {
- ofs = i;
- res = match(r, 0, buf, len, &ofs, caps);
- }
- }
- return res;
- }
- #ifdef SLRE_TEST
- #define N_CAPS 5
- int main(int argc, char *argv[])
- {
- struct slre slre;
- struct cap caps[N_CAPS];
- unsigned char data[1 * 1024 * 1024];
- FILE *fp;
- int i, res, len;
- if (argc < 2) {
- fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
- return 1;
- }
- fp = fopen(argv[2], "rb");
- if (fp == NULL) {
- fprintf(stderr, "Error: cannot open %s:%s\n",
- argv[2], strerror(errno));
- return 1;
- }
- if (!slre_compile(&slre, argv[1])) {
- fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
- return 1;
- }
- slre_dump(&slre, stderr);
- while (fgets(data, sizeof(data), fp) != NULL) {
- len = strlen(data);
- if ((len > 0) && (data[len-1] == '\n')) {
- data[len-1] = '\0';
- --len;
- }
- printf("Data = \"%s\"\n", data);
- (void) memset(caps, 0, sizeof(caps));
- res = 0;
- res = slre_match(&slre, data, len, caps);
- printf("Result [%d]: %d\n", i, res);
- for (i = 0; i < N_CAPS; i++) {
- if (caps[i].len > 0) {
- printf("Substring %d: len=%d [%.*s]\n", i,
- caps[i].len,
- caps[i].len, caps[i].ptr);
- }
- }
- printf("----------------------------------------------------\n");
- }
- (void) fclose(fp);
- return 0;
- }
- #endif
|