regparse.c 122 KB


  1. /**********************************************************************
  2. regparse.c - Oniguruma (regular expression library)
  3. **********************************************************************/
  4. /*-
  5. * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. #include "regparse.h"
  30. #include "st.h"
  31. #define WARN_BUFSIZE 256
  32. #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
  33. OnigSyntaxType OnigSyntaxRuby = {
  34. (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
  35. ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
  36. ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
  37. ONIG_SYN_OP_ESC_C_CONTROL )
  38. & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
  39. , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
  40. ONIG_SYN_OP2_OPTION_RUBY |
  41. ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
  42. ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
  43. ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
  44. ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
  45. ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
  46. ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
  47. ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
  48. ONIG_SYN_OP2_ESC_H_XDIGIT )
  49. , ( SYN_GNU_REGEX_BV |
  50. ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
  51. ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
  52. ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
  53. ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
  54. ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
  55. ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
  56. ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
  57. , ONIG_OPTION_NONE
  58. ,
  59. {
  60. (OnigCodePoint )'\\' /* esc */
  61. , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
  62. , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
  63. , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
  64. , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
  65. , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
  66. }
  67. };
  68. OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
  69. extern void onig_null_warn(const char* s ARG_UNUSED) { }
  70. #ifdef DEFAULT_WARN_FUNCTION
  71. static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
  72. #else
  73. static OnigWarnFunc onig_warn = onig_null_warn;
  74. #endif
  75. #ifdef DEFAULT_VERB_WARN_FUNCTION
  76. static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
  77. #else
  78. static OnigWarnFunc onig_verb_warn = onig_null_warn;
  79. #endif
  80. extern void onig_set_warn_func(OnigWarnFunc f)
  81. {
  82. onig_warn = f;
  83. }
  84. extern void onig_set_verb_warn_func(OnigWarnFunc f)
  85. {
  86. onig_verb_warn = f;
  87. }
  88. static void
  89. bbuf_free(BBuf* bbuf)
  90. {
  91. if (IS_NOT_NULL(bbuf)) {
  92. if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
  93. xfree(bbuf);
  94. }
  95. }
  96. static int
  97. bbuf_clone(BBuf** rto, BBuf* from)
  98. {
  99. int r;
  100. BBuf *to;
  101. *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
  102. CHECK_NULL_RETURN_MEMERR(to);
  103. r = BBUF_INIT(to, from->alloc);
  104. if (r != 0) return r;
  105. to->used = from->used;
  106. xmemcpy(to->p, from->p, from->used);
  107. return 0;
  108. }
  109. #define BACKREF_REL_TO_ABS(rel_no, env) \
  110. ((env)->num_mem + 1 + (rel_no))
  111. #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f))
  112. #define MBCODE_START_POS(enc) \
  113. (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
  114. #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  115. add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))
  116. #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  117. if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
  118. r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
  119. if (r) return r;\
  120. }\
  121. } while (0)
  122. #define BITSET_IS_EMPTY(bs,empty) do {\
  123. int i;\
  124. empty = 1;\
  125. for (i = 0; i < (int )BITSET_SIZE; i++) {\
  126. if ((bs)[i] != 0) {\
  127. empty = 0; break;\
  128. }\
  129. }\
  130. } while (0)
  131. static void
  132. bitset_set_range(BitSetRef bs, int from, int to)
  133. {
  134. int i;
  135. for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
  136. BITSET_SET_BIT(bs, i);
  137. }
  138. }
  139. #if 0
  140. static void
  141. bitset_set_all(BitSetRef bs)
  142. {
  143. int i;
  144. for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
  145. }
  146. #endif
  147. static void
  148. bitset_invert(BitSetRef bs)
  149. {
  150. int i;
  151. for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
  152. }
  153. static void
  154. bitset_invert_to(BitSetRef from, BitSetRef to)
  155. {
  156. int i;
  157. for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
  158. }
  159. static void
  160. bitset_and(BitSetRef dest, BitSetRef bs)
  161. {
  162. int i;
  163. for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
  164. }
  165. static void
  166. bitset_or(BitSetRef dest, BitSetRef bs)
  167. {
  168. int i;
  169. for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
  170. }
  171. static void
  172. bitset_copy(BitSetRef dest, BitSetRef bs)
  173. {
  174. int i;
  175. for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
  176. }
  177. extern int
  178. onig_strncmp(const UChar* s1, const UChar* s2, int n)
  179. {
  180. int x;
  181. while (n-- > 0) {
  182. x = *s2++ - *s1++;
  183. if (x) return x;
  184. }
  185. return 0;
  186. }
  187. extern void
  188. onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
  189. {
  190. int len = end - src;
  191. if (len > 0) {
  192. xmemcpy(dest, src, len);
  193. dest[len] = (UChar )0;
  194. }
  195. }
  196. #ifdef USE_NAMED_GROUP
  197. static UChar*
  198. strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
  199. {
  200. int slen, term_len, i;
  201. UChar *r;
  202. slen = end - s;
  203. term_len = ONIGENC_MBC_MINLEN(enc);
  204. r = (UChar* )xmalloc(slen + term_len);
  205. CHECK_NULL_RETURN(r);
  206. xmemcpy(r, s, slen);
  207. for (i = 0; i < term_len; i++)
  208. r[slen + i] = (UChar )0;
  209. return r;
  210. }
  211. #endif
  212. /* scan pattern methods */
  213. #define PEND_VALUE 0
  214. #define PFETCH_READY UChar* pfetch_prev
  215. #define PEND (p < end ? 0 : 1)
  216. #define PUNFETCH p = pfetch_prev
  217. #define PINC do { \
  218. pfetch_prev = p; \
  219. p += ONIGENC_MBC_ENC_LEN(enc, p); \
  220. } while (0)
  221. #define PFETCH(c) do { \
  222. c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  223. pfetch_prev = p; \
  224. p += ONIGENC_MBC_ENC_LEN(enc, p); \
  225. if(UNEXPECTED(p > end)) p = end; \
  226. } while (0)
  227. #define PINC_S do { \
  228. p += ONIGENC_MBC_ENC_LEN(enc, p); \
  229. if(UNEXPECTED(p > end)) p = end; \
  230. } while (0)
  231. #define PFETCH_S(c) do { \
  232. c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  233. p += ONIGENC_MBC_ENC_LEN(enc, p); \
  234. if(UNEXPECTED(p > end)) p = end; \
  235. } while (0)
  236. #define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
  237. #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c)
  238. static UChar*
  239. strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
  240. int capa)
  241. {
  242. UChar* r;
  243. if (dest)
  244. r = (UChar* )xrealloc(dest, capa + 1);
  245. else
  246. r = (UChar* )xmalloc(capa + 1);
  247. CHECK_NULL_RETURN(r);
  248. onig_strcpy(r + (dest_end - dest), src, src_end);
  249. return r;
  250. }
  251. /* dest on static area */
  252. static UChar*
  253. strcat_capa_from_static(UChar* dest, UChar* dest_end,
  254. const UChar* src, const UChar* src_end, int capa)
  255. {
  256. UChar* r;
  257. r = (UChar* )xmalloc(capa + 1);
  258. CHECK_NULL_RETURN(r);
  259. onig_strcpy(r, dest, dest_end);
  260. onig_strcpy(r + (dest_end - dest), src, src_end);
  261. return r;
  262. }
  263. #ifdef USE_ST_LIBRARY
  264. typedef struct {
  265. UChar* s;
  266. UChar* end;
  267. } st_str_end_key;
  268. static int
  269. str_end_cmp(st_str_end_key* x, st_str_end_key* y)
  270. {
  271. UChar *p, *q;
  272. int c;
  273. if ((x->end - x->s) != (y->end - y->s))
  274. return 1;
  275. p = x->s;
  276. q = y->s;
  277. while (p < x->end) {
  278. c = (int )*p - (int )*q;
  279. if (c != 0) return c;
  280. p++; q++;
  281. }
  282. return 0;
  283. }
  284. static int
  285. str_end_hash(st_str_end_key* x)
  286. {
  287. UChar *p;
  288. int val = 0;
  289. p = x->s;
  290. while (p < x->end) {
  291. val = val * 997 + (int )*p++;
  292. }
  293. return val + (val >> 5);
  294. }
  295. extern hash_table_type*
  296. onig_st_init_strend_table_with_size(int size)
  297. {
  298. static struct st_hash_type hashType = {
  299. str_end_cmp,
  300. str_end_hash,
  301. };
  302. return (hash_table_type* )
  303. onig_st_init_table_with_size(&hashType, size);
  304. }
  305. extern int
  306. onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
  307. const UChar* end_key, hash_data_type *value)
  308. {
  309. st_str_end_key key;
  310. key.s = (UChar* )str_key;
  311. key.end = (UChar* )end_key;
  312. return onig_st_lookup(table, (st_data_t )(&key), value);
  313. }
  314. extern int
  315. onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
  316. const UChar* end_key, hash_data_type value)
  317. {
  318. st_str_end_key* key;
  319. int result;
  320. key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
  321. key->s = (UChar* )str_key;
  322. key->end = (UChar* )end_key;
  323. result = onig_st_insert(table, (st_data_t )key, value);
  324. if (result) {
  325. xfree(key);
  326. }
  327. return result;
  328. }
  329. #endif /* USE_ST_LIBRARY */
  330. #ifdef USE_NAMED_GROUP
  331. #define INIT_NAME_BACKREFS_ALLOC_NUM 8
  332. typedef struct {
  333. UChar* name;
  334. int name_len; /* byte length */
  335. int back_num; /* number of backrefs */
  336. int back_alloc;
  337. int back_ref1;
  338. int* back_refs;
  339. } NameEntry;
  340. #ifdef USE_ST_LIBRARY
  341. typedef st_table NameTable;
  342. typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
  343. #define NAMEBUF_SIZE 24
  344. #define NAMEBUF_SIZE_1 25
  345. #ifdef ONIG_DEBUG
  346. static int
  347. i_print_name_entry(UChar* key, NameEntry* e, void* arg)
  348. {
  349. int i;
  350. FILE* fp = (FILE* )arg;
  351. fprintf(fp, "%s: ", e->name);
  352. if (e->back_num == 0)
  353. fputs("-", fp);
  354. else if (e->back_num == 1)
  355. fprintf(fp, "%d", e->back_ref1);
  356. else {
  357. for (i = 0; i < e->back_num; i++) {
  358. if (i > 0) fprintf(fp, ", ");
  359. fprintf(fp, "%d", e->back_refs[i]);
  360. }
  361. }
  362. fputs("\n", fp);
  363. return ST_CONTINUE;
  364. }
  365. extern int
  366. onig_print_names(FILE* fp, regex_t* reg)
  367. {
  368. NameTable* t = (NameTable* )reg->name_table;
  369. if (IS_NOT_NULL(t)) {
  370. fprintf(fp, "name table\n");
  371. onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
  372. fputs("\n", fp);
  373. }
  374. return 0;
  375. }
  376. #endif /* ONIG_DEBUG */
  377. static int
  378. i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
  379. {
  380. xfree(e->name);
  381. if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
  382. xfree(key);
  383. xfree(e);
  384. return ST_DELETE;
  385. }
  386. static int
  387. names_clear(regex_t* reg)
  388. {
  389. NameTable* t = (NameTable* )reg->name_table;
  390. if (IS_NOT_NULL(t)) {
  391. onig_st_foreach(t, i_free_name_entry, 0);
  392. }
  393. return 0;
  394. }
  395. extern int
  396. onig_names_free(regex_t* reg)
  397. {
  398. int r;
  399. NameTable* t;
  400. r = names_clear(reg);
  401. if (r) return r;
  402. t = (NameTable* )reg->name_table;
  403. if (IS_NOT_NULL(t)) onig_st_free_table(t);
  404. reg->name_table = (void* )NULL;
  405. return 0;
  406. }
  407. static NameEntry*
  408. name_find(regex_t* reg, const UChar* name, const UChar* name_end)
  409. {
  410. NameEntry* e;
  411. NameTable* t = (NameTable* )reg->name_table;
  412. e = (NameEntry* )NULL;
  413. if (IS_NOT_NULL(t)) {
  414. onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
  415. }
  416. return e;
  417. }
  418. typedef struct {
  419. int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
  420. regex_t* reg;
  421. void* arg;
  422. int ret;
  423. OnigEncoding enc;
  424. } INamesArg;
  425. static int
  426. i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
  427. {
  428. int r = (*(arg->func))(e->name,
  429. e->name + e->name_len,
  430. e->back_num,
  431. (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
  432. arg->reg, arg->arg);
  433. if (r != 0) {
  434. arg->ret = r;
  435. return ST_STOP;
  436. }
  437. return ST_CONTINUE;
  438. }
  439. extern int
  440. onig_foreach_name(regex_t* reg,
  441. int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
  442. {
  443. INamesArg narg;
  444. NameTable* t = (NameTable* )reg->name_table;
  445. narg.ret = 0;
  446. if (IS_NOT_NULL(t)) {
  447. narg.func = func;
  448. narg.reg = reg;
  449. narg.arg = arg;
  450. narg.enc = reg->enc; /* should be pattern encoding. */
  451. onig_st_foreach(t, i_names, (HashDataType )&narg);
  452. }
  453. return narg.ret;
  454. }
  455. static int
  456. i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
  457. {
  458. int i;
  459. if (e->back_num > 1) {
  460. for (i = 0; i < e->back_num; i++) {
  461. e->back_refs[i] = map[e->back_refs[i]].new_val;
  462. }
  463. }
  464. else if (e->back_num == 1) {
  465. e->back_ref1 = map[e->back_ref1].new_val;
  466. }
  467. return ST_CONTINUE;
  468. }
  469. extern int
  470. onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
  471. {
  472. NameTable* t = (NameTable* )reg->name_table;
  473. if (IS_NOT_NULL(t)) {
  474. onig_st_foreach(t, i_renumber_name, (HashDataType )map);
  475. }
  476. return 0;
  477. }
  478. extern int
  479. onig_number_of_names(regex_t* reg)
  480. {
  481. NameTable* t = (NameTable* )reg->name_table;
  482. if (IS_NOT_NULL(t))
  483. return t->num_entries;
  484. else
  485. return 0;
  486. }
  487. #else /* USE_ST_LIBRARY */
  488. #define INIT_NAMES_ALLOC_NUM 8
  489. typedef struct {
  490. NameEntry* e;
  491. int num;
  492. int alloc;
  493. } NameTable;
  494. #ifdef ONIG_DEBUG
  495. extern int
  496. onig_print_names(FILE* fp, regex_t* reg)
  497. {
  498. int i, j;
  499. NameEntry* e;
  500. NameTable* t = (NameTable* )reg->name_table;
  501. if (IS_NOT_NULL(t) && t->num > 0) {
  502. fprintf(fp, "name table\n");
  503. for (i = 0; i < t->num; i++) {
  504. e = &(t->e[i]);
  505. fprintf(fp, "%s: ", e->name);
  506. if (e->back_num == 0) {
  507. fputs("-", fp);
  508. }
  509. else if (e->back_num == 1) {
  510. fprintf(fp, "%d", e->back_ref1);
  511. }
  512. else {
  513. for (j = 0; j < e->back_num; j++) {
  514. if (j > 0) fprintf(fp, ", ");
  515. fprintf(fp, "%d", e->back_refs[j]);
  516. }
  517. }
  518. fputs("\n", fp);
  519. }
  520. fputs("\n", fp);
  521. }
  522. return 0;
  523. }
  524. #endif
  525. static int
  526. names_clear(regex_t* reg)
  527. {
  528. int i;
  529. NameEntry* e;
  530. NameTable* t = (NameTable* )reg->name_table;
  531. if (IS_NOT_NULL(t)) {
  532. for (i = 0; i < t->num; i++) {
  533. e = &(t->e[i]);
  534. if (IS_NOT_NULL(e->name)) {
  535. xfree(e->name);
  536. e->name = NULL;
  537. e->name_len = 0;
  538. e->back_num = 0;
  539. e->back_alloc = 0;
  540. if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
  541. e->back_refs = (int* )NULL;
  542. }
  543. }
  544. if (IS_NOT_NULL(t->e)) {
  545. xfree(t->e);
  546. t->e = NULL;
  547. }
  548. t->num = 0;
  549. }
  550. return 0;
  551. }
  552. extern int
  553. onig_names_free(regex_t* reg)
  554. {
  555. int r;
  556. NameTable* t;
  557. r = names_clear(reg);
  558. if (r) return r;
  559. t = (NameTable* )reg->name_table;
  560. if (IS_NOT_NULL(t)) xfree(t);
  561. reg->name_table = NULL;
  562. return 0;
  563. }
  564. static NameEntry*
  565. name_find(regex_t* reg, UChar* name, UChar* name_end)
  566. {
  567. int i, len;
  568. NameEntry* e;
  569. NameTable* t = (NameTable* )reg->name_table;
  570. if (IS_NOT_NULL(t)) {
  571. len = name_end - name;
  572. for (i = 0; i < t->num; i++) {
  573. e = &(t->e[i]);
  574. if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
  575. return e;
  576. }
  577. }
  578. return (NameEntry* )NULL;
  579. }
  580. extern int
  581. onig_foreach_name(regex_t* reg,
  582. int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
  583. {
  584. int i, r;
  585. NameEntry* e;
  586. NameTable* t = (NameTable* )reg->name_table;
  587. if (IS_NOT_NULL(t)) {
  588. for (i = 0; i < t->num; i++) {
  589. e = &(t->e[i]);
  590. r = (*func)(e->name, e->name + e->name_len, e->back_num,
  591. (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
  592. reg, arg);
  593. if (r != 0) return r;
  594. }
  595. }
  596. return 0;
  597. }
  598. extern int
  599. onig_number_of_names(regex_t* reg)
  600. {
  601. NameTable* t = (NameTable* )reg->name_table;
  602. if (IS_NOT_NULL(t))
  603. return t->num;
  604. else
  605. return 0;
  606. }
  607. #endif /* else USE_ST_LIBRARY */
  608. static int
  609. name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
  610. {
  611. int alloc;
  612. NameEntry* e;
  613. NameTable* t = (NameTable* )reg->name_table;
  614. if (name_end - name <= 0)
  615. return ONIGERR_EMPTY_GROUP_NAME;
  616. e = name_find(reg, name, name_end);
  617. if (IS_NULL(e)) {
  618. #ifdef USE_ST_LIBRARY
  619. if (IS_NULL(t)) {
  620. t = onig_st_init_strend_table_with_size(5);
  621. reg->name_table = (void* )t;
  622. }
  623. e = (NameEntry* )xmalloc(sizeof(NameEntry));
  624. CHECK_NULL_RETURN_MEMERR(e);
  625. e->name = strdup_with_null(reg->enc, name, name_end);
  626. if (IS_NULL(e->name)) {
  627. xfree(e); return ONIGERR_MEMORY;
  628. }
  629. onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
  630. (HashDataType )e);
  631. e->name_len = name_end - name;
  632. e->back_num = 0;
  633. e->back_alloc = 0;
  634. e->back_refs = (int* )NULL;
  635. #else
  636. if (IS_NULL(t)) {
  637. alloc = INIT_NAMES_ALLOC_NUM;
  638. t = (NameTable* )xmalloc(sizeof(NameTable));
  639. CHECK_NULL_RETURN_MEMERR(t);
  640. t->e = NULL;
  641. t->alloc = 0;
  642. t->num = 0;
  643. t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
  644. if (IS_NULL(t->e)) {
  645. xfree(t);
  646. return ONIGERR_MEMORY;
  647. }
  648. t->alloc = alloc;
  649. reg->name_table = t;
  650. goto clear;
  651. }
  652. else if (t->num == t->alloc) {
  653. int i;
  654. alloc = t->alloc * 2;
  655. t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
  656. CHECK_NULL_RETURN_MEMERR(t->e);
  657. t->alloc = alloc;
  658. clear:
  659. for (i = t->num; i < t->alloc; i++) {
  660. t->e[i].name = NULL;
  661. t->e[i].name_len = 0;
  662. t->e[i].back_num = 0;
  663. t->e[i].back_alloc = 0;
  664. t->e[i].back_refs = (int* )NULL;
  665. }
  666. }
  667. e = &(t->e[t->num]);
  668. t->num++;
  669. e->name = strdup_with_null(reg->enc, name, name_end);
  670. if (IS_NULL(e->name)) return ONIGERR_MEMORY;
  671. e->name_len = name_end - name;
  672. #endif
  673. }
  674. if (e->back_num >= 1 &&
  675. ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
  676. onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
  677. name, name_end);
  678. return ONIGERR_MULTIPLEX_DEFINED_NAME;
  679. }
  680. e->back_num++;
  681. if (e->back_num == 1) {
  682. e->back_ref1 = backref;
  683. }
  684. else {
  685. if (e->back_num == 2) {
  686. alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
  687. e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
  688. CHECK_NULL_RETURN_MEMERR(e->back_refs);
  689. e->back_alloc = alloc;
  690. e->back_refs[0] = e->back_ref1;
  691. e->back_refs[1] = backref;
  692. }
  693. else {
  694. if (e->back_num > e->back_alloc) {
  695. alloc = e->back_alloc * 2;
  696. e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
  697. CHECK_NULL_RETURN_MEMERR(e->back_refs);
  698. e->back_alloc = alloc;
  699. }
  700. e->back_refs[e->back_num - 1] = backref;
  701. }
  702. }
  703. return 0;
  704. }
  705. extern int
  706. onig_name_to_group_numbers(regex_t* reg, const UChar* name,
  707. const UChar* name_end, int** nums)
  708. {
  709. NameEntry* e = name_find(reg, name, name_end);
  710. if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
  711. switch (e->back_num) {
  712. case 0:
  713. break;
  714. case 1:
  715. *nums = &(e->back_ref1);
  716. break;
  717. default:
  718. *nums = e->back_refs;
  719. break;
  720. }
  721. return e->back_num;
  722. }
  723. extern int
  724. onig_name_to_backref_number(regex_t* reg, const UChar* name,
  725. const UChar* name_end, OnigRegion *region)
  726. {
  727. int i, n, *nums;
  728. n = onig_name_to_group_numbers(reg, name, name_end, &nums);
  729. if (n < 0)
  730. return n;
  731. else if (n == 0)
  732. return ONIGERR_PARSER_BUG;
  733. else if (n == 1)
  734. return nums[0];
  735. else {
  736. if (IS_NOT_NULL(region)) {
  737. for (i = n - 1; i >= 0; i--) {
  738. if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
  739. return nums[i];
  740. }
  741. }
  742. return nums[n - 1];
  743. }
  744. }
  745. #else /* USE_NAMED_GROUP */
  746. extern int
  747. onig_name_to_group_numbers(regex_t* reg, const UChar* name,
  748. const UChar* name_end, int** nums)
  749. {
  750. return ONIG_NO_SUPPORT_CONFIG;
  751. }
  752. extern int
  753. onig_name_to_backref_number(regex_t* reg, const UChar* name,
  754. const UChar* name_end, OnigRegion* region)
  755. {
  756. return ONIG_NO_SUPPORT_CONFIG;
  757. }
  758. extern int
  759. onig_foreach_name(regex_t* reg,
  760. int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
  761. {
  762. return ONIG_NO_SUPPORT_CONFIG;
  763. }
  764. extern int
  765. onig_number_of_names(regex_t* reg)
  766. {
  767. return 0;
  768. }
  769. #endif /* else USE_NAMED_GROUP */
  770. extern int
  771. onig_noname_group_capture_is_active(regex_t* reg)
  772. {
  773. if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
  774. return 0;
  775. #ifdef USE_NAMED_GROUP
  776. if (onig_number_of_names(reg) > 0 &&
  777. IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
  778. !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
  779. return 0;
  780. }
  781. #endif
  782. return 1;
  783. }
  784. #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
  785. static void
  786. scan_env_clear(ScanEnv* env)
  787. {
  788. int i;
  789. BIT_STATUS_CLEAR(env->capture_history);
  790. BIT_STATUS_CLEAR(env->bt_mem_start);
  791. BIT_STATUS_CLEAR(env->bt_mem_end);
  792. BIT_STATUS_CLEAR(env->backrefed_mem);
  793. env->error = (UChar* )NULL;
  794. env->error_end = (UChar* )NULL;
  795. env->num_call = 0;
  796. env->num_mem = 0;
  797. #ifdef USE_NAMED_GROUP
  798. env->num_named = 0;
  799. #endif
  800. env->mem_alloc = 0;
  801. env->mem_nodes_dynamic = (Node** )NULL;
  802. for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
  803. env->mem_nodes_static[i] = NULL_NODE;
  804. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  805. env->num_comb_exp_check = 0;
  806. env->comb_exp_max_regnum = 0;
  807. env->curr_max_regnum = 0;
  808. env->has_recursion = 0;
  809. #endif
  810. }
  811. static int
  812. scan_env_add_mem_entry(ScanEnv* env)
  813. {
  814. int i, need, alloc;
  815. Node** p;
  816. need = env->num_mem + 1;
  817. if (need >= SCANENV_MEMNODES_SIZE) {
  818. if (env->mem_alloc <= need) {
  819. if (IS_NULL(env->mem_nodes_dynamic)) {
  820. alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
  821. p = (Node** )xmalloc(sizeof(Node*) * alloc);
  822. xmemcpy(p, env->mem_nodes_static,
  823. sizeof(Node*) * SCANENV_MEMNODES_SIZE);
  824. }
  825. else {
  826. alloc = env->mem_alloc * 2;
  827. p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
  828. }
  829. CHECK_NULL_RETURN_MEMERR(p);
  830. for (i = env->num_mem + 1; i < alloc; i++)
  831. p[i] = NULL_NODE;
  832. env->mem_nodes_dynamic = p;
  833. env->mem_alloc = alloc;
  834. }
  835. }
  836. env->num_mem++;
  837. return env->num_mem;
  838. }
  839. static int
  840. scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
  841. {
  842. if (env->num_mem >= num)
  843. SCANENV_MEM_NODES(env)[num] = node;
  844. else
  845. return ONIGERR_PARSER_BUG;
  846. return 0;
  847. }
  848. #ifdef USE_PARSE_TREE_NODE_RECYCLE
  849. typedef struct _FreeNode {
  850. struct _FreeNode* next;
  851. } FreeNode;
  852. static FreeNode* FreeNodeList = (FreeNode* )NULL;
  853. #endif
  854. extern void
  855. onig_node_free(Node* node)
  856. {
  857. start:
  858. if (IS_NULL(node)) return ;
  859. switch (NTYPE(node)) {
  860. case NT_STR:
  861. if (NSTR(node)->capa != 0 &&
  862. IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
  863. xfree(NSTR(node)->s);
  864. }
  865. break;
  866. case NT_LIST:
  867. case NT_ALT:
  868. onig_node_free(NCAR(node));
  869. {
  870. Node* next_node = NCDR(node);
  871. #ifdef USE_PARSE_TREE_NODE_RECYCLE
  872. {
  873. FreeNode* n = (FreeNode* )node;
  874. THREAD_ATOMIC_START;
  875. n->next = FreeNodeList;
  876. FreeNodeList = n;
  877. THREAD_ATOMIC_END;
  878. }
  879. #else
  880. xfree(node);
  881. #endif
  882. node = next_node;
  883. goto start;
  884. }
  885. break;
  886. case NT_CCLASS:
  887. {
  888. CClassNode* cc = NCCLASS(node);
  889. if (IS_NCCLASS_SHARE(cc)) return ;
  890. if (cc->mbuf)
  891. bbuf_free(cc->mbuf);
  892. }
  893. break;
  894. case NT_QTFR:
  895. if (NQTFR(node)->target)
  896. onig_node_free(NQTFR(node)->target);
  897. break;
  898. case NT_ENCLOSE:
  899. if (NENCLOSE(node)->target)
  900. onig_node_free(NENCLOSE(node)->target);
  901. break;
  902. case NT_BREF:
  903. if (IS_NOT_NULL(NBREF(node)->back_dynamic))
  904. xfree(NBREF(node)->back_dynamic);
  905. break;
  906. case NT_ANCHOR:
  907. if (NANCHOR(node)->target)
  908. onig_node_free(NANCHOR(node)->target);
  909. break;
  910. }
  911. #ifdef USE_PARSE_TREE_NODE_RECYCLE
  912. {
  913. FreeNode* n = (FreeNode* )node;
  914. THREAD_ATOMIC_START;
  915. n->next = FreeNodeList;
  916. FreeNodeList = n;
  917. THREAD_ATOMIC_END;
  918. }
  919. #else
  920. xfree(node);
  921. #endif
  922. }
  923. #ifdef USE_PARSE_TREE_NODE_RECYCLE
  924. extern int
  925. onig_free_node_list(void)
  926. {
  927. FreeNode* n;
  928. /* THREAD_ATOMIC_START; */
  929. while (IS_NOT_NULL(FreeNodeList)) {
  930. n = FreeNodeList;
  931. FreeNodeList = FreeNodeList->next;
  932. xfree(n);
  933. }
  934. /* THREAD_ATOMIC_END; */
  935. return 0;
  936. }
  937. #endif
  938. static Node*
  939. node_new(void)
  940. {
  941. Node* node;
  942. #ifdef USE_PARSE_TREE_NODE_RECYCLE
  943. THREAD_ATOMIC_START;
  944. if (IS_NOT_NULL(FreeNodeList)) {
  945. node = (Node* )FreeNodeList;
  946. FreeNodeList = FreeNodeList->next;
  947. THREAD_ATOMIC_END;
  948. return node;
  949. }
  950. THREAD_ATOMIC_END;
  951. #endif
  952. node = (Node* )xmalloc(sizeof(Node));
  953. /* xmemset(node, 0, sizeof(Node)); */
  954. return node;
  955. }
  956. static void
  957. initialize_cclass(CClassNode* cc)
  958. {
  959. BITSET_CLEAR(cc->bs);
  960. /* cc->base.flags = 0; */
  961. cc->flags = 0;
  962. cc->mbuf = NULL;
  963. }
  964. static Node*
  965. node_new_cclass(void)
  966. {
  967. Node* node = node_new();
  968. CHECK_NULL_RETURN(node);
  969. SET_NTYPE(node, NT_CCLASS);
  970. initialize_cclass(NCCLASS(node));
  971. return node;
  972. }
  973. static Node*
  974. node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
  975. const OnigCodePoint ranges[])
  976. {
  977. int n, i;
  978. CClassNode* cc;
  979. OnigCodePoint j;
  980. Node* node = node_new_cclass();
  981. CHECK_NULL_RETURN(node);
  982. cc = NCCLASS(node);
  983. if (not != 0) NCCLASS_SET_NOT(cc);
  984. BITSET_CLEAR(cc->bs);
  985. if (sb_out > 0 && IS_NOT_NULL(ranges)) {
  986. n = ONIGENC_CODE_RANGE_NUM(ranges);
  987. for (i = 0; i < n; i++) {
  988. for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
  989. j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
  990. if (j >= sb_out) goto sb_end;
  991. BITSET_SET_BIT(cc->bs, j);
  992. }
  993. }
  994. }
  995. sb_end:
  996. if (IS_NULL(ranges)) {
  997. is_null:
  998. cc->mbuf = NULL;
  999. }
  1000. else {
  1001. BBuf* bbuf;
  1002. n = ONIGENC_CODE_RANGE_NUM(ranges);
  1003. if (n == 0) goto is_null;
  1004. bbuf = (BBuf* )xmalloc(sizeof(BBuf));
  1005. CHECK_NULL_RETURN(bbuf);
  1006. bbuf->alloc = n + 1;
  1007. bbuf->used = n + 1;
  1008. bbuf->p = (UChar* )((void* )ranges);
  1009. cc->mbuf = bbuf;
  1010. }
  1011. return node;
  1012. }
  1013. static Node*
  1014. node_new_ctype(int type, int not)
  1015. {
  1016. Node* node = node_new();
  1017. CHECK_NULL_RETURN(node);
  1018. SET_NTYPE(node, NT_CTYPE);
  1019. NCTYPE(node)->ctype = type;
  1020. NCTYPE(node)->not = not;
  1021. return node;
  1022. }
  1023. static Node*
  1024. node_new_anychar(void)
  1025. {
  1026. Node* node = node_new();
  1027. CHECK_NULL_RETURN(node);
  1028. SET_NTYPE(node, NT_CANY);
  1029. return node;
  1030. }
  1031. static Node*
  1032. node_new_list(Node* left, Node* right)
  1033. {
  1034. Node* node = node_new();
  1035. CHECK_NULL_RETURN(node);
  1036. SET_NTYPE(node, NT_LIST);
  1037. NCAR(node) = left;
  1038. NCDR(node) = right;
  1039. return node;
  1040. }
  1041. extern Node*
  1042. onig_node_new_list(Node* left, Node* right)
  1043. {
  1044. return node_new_list(left, right);
  1045. }
  1046. extern Node*
  1047. onig_node_list_add(Node* list, Node* x)
  1048. {
  1049. Node *n;
  1050. n = onig_node_new_list(x, NULL);
  1051. if (IS_NULL(n)) return NULL_NODE;
  1052. if (IS_NOT_NULL(list)) {
  1053. while (IS_NOT_NULL(NCDR(list)))
  1054. list = NCDR(list);
  1055. NCDR(list) = n;
  1056. }
  1057. return n;
  1058. }
  1059. extern Node*
  1060. onig_node_new_alt(Node* left, Node* right)
  1061. {
  1062. Node* node = node_new();
  1063. CHECK_NULL_RETURN(node);
  1064. SET_NTYPE(node, NT_ALT);
  1065. NCAR(node) = left;
  1066. NCDR(node) = right;
  1067. return node;
  1068. }
  1069. extern Node*
  1070. onig_node_new_anchor(int type)
  1071. {
  1072. Node* node = node_new();
  1073. CHECK_NULL_RETURN(node);
  1074. SET_NTYPE(node, NT_ANCHOR);
  1075. NANCHOR(node)->type = type;
  1076. NANCHOR(node)->target = NULL;
  1077. NANCHOR(node)->char_len = -1;
  1078. return node;
  1079. }
  1080. static Node*
  1081. node_new_backref(int back_num, int* backrefs, int by_name,
  1082. #ifdef USE_BACKREF_WITH_LEVEL
  1083. int exist_level, int nest_level,
  1084. #endif
  1085. ScanEnv* env)
  1086. {
  1087. int i;
  1088. Node* node = node_new();
  1089. CHECK_NULL_RETURN(node);
  1090. SET_NTYPE(node, NT_BREF);
  1091. NBREF(node)->state = 0;
  1092. NBREF(node)->back_num = back_num;
  1093. NBREF(node)->back_dynamic = (int* )NULL;
  1094. if (by_name != 0)
  1095. NBREF(node)->state |= NST_NAME_REF;
  1096. #ifdef USE_BACKREF_WITH_LEVEL
  1097. if (exist_level != 0) {
  1098. NBREF(node)->state |= NST_NEST_LEVEL;
  1099. NBREF(node)->nest_level = nest_level;
  1100. }
  1101. #endif
  1102. for (i = 0; i < back_num; i++) {
  1103. if (backrefs[i] <= env->num_mem &&
  1104. IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
  1105. NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
  1106. break;
  1107. }
  1108. }
  1109. if (back_num <= NODE_BACKREFS_SIZE) {
  1110. for (i = 0; i < back_num; i++)
  1111. NBREF(node)->back_static[i] = backrefs[i];
  1112. }
  1113. else {
  1114. int* p = (int* )xmalloc(sizeof(int) * back_num);
  1115. if (IS_NULL(p)) {
  1116. onig_node_free(node);
  1117. return NULL;
  1118. }
  1119. NBREF(node)->back_dynamic = p;
  1120. for (i = 0; i < back_num; i++)
  1121. p[i] = backrefs[i];
  1122. }
  1123. return node;
  1124. }
  #ifdef USE_SUBEXP_CALL
  /*
   * Build a subexpression-call node (NT_CALL).
   *   name, name_end: stored as given (the pointers are not copied)
   *   gnum:           group number; call by number if gnum != 0
   * The target starts out as NULL_NODE and the state as 0.
   */
  static Node*
  node_new_call(UChar* name, UChar* name_end, int gnum)
  {
    Node* call = node_new();

    CHECK_NULL_RETURN(call);
    SET_NTYPE(call, NT_CALL);
    NCALL(call)->state     = 0;
    NCALL(call)->target    = NULL_NODE;
    NCALL(call)->group_num = gnum;
    NCALL(call)->name_end  = name_end;
    NCALL(call)->name      = name;
    return call;
  }
  #endif
  1140. static Node*
  1141. node_new_quantifier(int lower, int upper, int by_number)
  1142. {
  1143. Node* node = node_new();
  1144. CHECK_NULL_RETURN(node);
  1145. SET_NTYPE(node, NT_QTFR);
  1146. NQTFR(node)->state = 0;
  1147. NQTFR(node)->target = NULL;
  1148. NQTFR(node)->lower = lower;
  1149. NQTFR(node)->upper = upper;
  1150. NQTFR(node)->greedy = 1;
  1151. NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
  1152. NQTFR(node)->head_exact = NULL_NODE;
  1153. NQTFR(node)->next_head_exact = NULL_NODE;
  1154. NQTFR(node)->is_refered = 0;
  1155. if (by_number != 0)
  1156. NQTFR(node)->state |= NST_BY_NUMBER;
  1157. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  1158. NQTFR(node)->comb_exp_check_num = 0;
  1159. #endif
  1160. return node;
  1161. }
  1162. static Node*
  1163. node_new_enclose(int type)
  1164. {
  1165. Node* node = node_new();
  1166. CHECK_NULL_RETURN(node);
  1167. SET_NTYPE(node, NT_ENCLOSE);
  1168. NENCLOSE(node)->type = type;
  1169. NENCLOSE(node)->state = 0;
  1170. NENCLOSE(node)->regnum = 0;
  1171. NENCLOSE(node)->option = 0;
  1172. NENCLOSE(node)->target = NULL;
  1173. NENCLOSE(node)->call_addr = -1;
  1174. NENCLOSE(node)->opt_count = 0;
  1175. return node;
  1176. }
  1177. extern Node*
  1178. onig_node_new_enclose(int type)
  1179. {
  1180. return node_new_enclose(type);
  1181. }
  1182. static Node*
  1183. node_new_enclose_memory(OnigOptionType option, int is_named)
  1184. {
  1185. Node* node = node_new_enclose(ENCLOSE_MEMORY);
  1186. CHECK_NULL_RETURN(node);
  1187. if (is_named != 0)
  1188. SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
  1189. #ifdef USE_SUBEXP_CALL
  1190. NENCLOSE(node)->option = option;
  1191. #endif
  1192. return node;
  1193. }
  1194. static Node*
  1195. node_new_option(OnigOptionType option)
  1196. {
  1197. Node* node = node_new_enclose(ENCLOSE_OPTION);
  1198. CHECK_NULL_RETURN(node);
  1199. NENCLOSE(node)->option = option;
  1200. return node;
  1201. }
  1202. extern int
  1203. onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
  1204. {
  1205. int addlen = end - s;
  1206. if (addlen > 0) {
  1207. int len = NSTR(node)->end - NSTR(node)->s;
  1208. if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
  1209. UChar* p;
  1210. int capa = len + addlen + NODE_STR_MARGIN;
  1211. if (capa <= NSTR(node)->capa) {
  1212. onig_strcpy(NSTR(node)->s + len, s, end);
  1213. }
  1214. else {
  1215. if (NSTR(node)->s == NSTR(node)->buf)
  1216. p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
  1217. s, end, capa);
  1218. else
  1219. p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
  1220. CHECK_NULL_RETURN_MEMERR(p);
  1221. NSTR(node)->s = p;
  1222. NSTR(node)->capa = capa;
  1223. }
  1224. }
  1225. else {
  1226. onig_strcpy(NSTR(node)->s + len, s, end);
  1227. }
  1228. NSTR(node)->end = NSTR(node)->s + len + addlen;
  1229. }
  1230. return 0;
  1231. }
  1232. extern int
  1233. onig_node_str_set(Node* node, const UChar* s, const UChar* end)
  1234. {
  1235. onig_node_str_clear(node);
  1236. return onig_node_str_cat(node, s, end);
  1237. }
  1238. static int
  1239. node_str_cat_char(Node* node, UChar c)
  1240. {
  1241. UChar s[1];
  1242. s[0] = c;
  1243. return onig_node_str_cat(node, s, s + 1);
  1244. }
  1245. extern void
  1246. onig_node_conv_to_str_node(Node* node, int flag)
  1247. {
  1248. SET_NTYPE(node, NT_STR);
  1249. NSTR(node)->flag = flag;
  1250. NSTR(node)->capa = 0;
  1251. NSTR(node)->s = NSTR(node)->buf;
  1252. NSTR(node)->end = NSTR(node)->buf;
  1253. }
  1254. extern void
  1255. onig_node_str_clear(Node* node)
  1256. {
  1257. if (NSTR(node)->capa != 0 &&
  1258. IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
  1259. xfree(NSTR(node)->s);
  1260. }
  1261. NSTR(node)->capa = 0;
  1262. NSTR(node)->flag = 0;
  1263. NSTR(node)->s = NSTR(node)->buf;
  1264. NSTR(node)->end = NSTR(node)->buf;
  1265. }
  1266. static Node*
  1267. node_new_str(const UChar* s, const UChar* end)
  1268. {
  1269. Node* node = node_new();
  1270. CHECK_NULL_RETURN(node);
  1271. SET_NTYPE(node, NT_STR);
  1272. NSTR(node)->capa = 0;
  1273. NSTR(node)->flag = 0;
  1274. NSTR(node)->s = NSTR(node)->buf;
  1275. NSTR(node)->end = NSTR(node)->buf;
  1276. if (onig_node_str_cat(node, s, end)) {
  1277. onig_node_free(node);
  1278. return NULL;
  1279. }
  1280. return node;
  1281. }
  1282. extern Node*
  1283. onig_node_new_str(const UChar* s, const UChar* end)
  1284. {
  1285. return node_new_str(s, end);
  1286. }
  1287. static Node*
  1288. node_new_str_raw(UChar* s, UChar* end)
  1289. {
  1290. Node* node = node_new_str(s, end);
  1291. NSTRING_SET_RAW(node);
  1292. return node;
  1293. }
  1294. static Node*
  1295. node_new_empty(void)
  1296. {
  1297. return node_new_str(NULL, NULL);
  1298. }
  1299. static Node*
  1300. node_new_str_raw_char(UChar c)
  1301. {
  1302. UChar p[1];
  1303. p[0] = c;
  1304. return node_new_str_raw(p, p + 1);
  1305. }
  1306. static Node*
  1307. str_node_split_last_char(StrNode* sn, OnigEncoding enc)
  1308. {
  1309. const UChar *p;
  1310. Node* n = NULL_NODE;
  1311. if (sn->end > sn->s) {
  1312. p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
  1313. if (p && p > sn->s) { /* can be splitted. */
  1314. n = node_new_str(p, sn->end);
  1315. if ((sn->flag & NSTR_RAW) != 0)
  1316. NSTRING_SET_RAW(n);
  1317. sn->end = (UChar* )p;
  1318. }
  1319. }
  1320. return n;
  1321. }
  1322. static int
  1323. str_node_can_be_split(StrNode* sn, OnigEncoding enc)
  1324. {
  1325. if (sn->end > sn->s) {
  1326. return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
  1327. }
  1328. return 0;
  1329. }
  #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
  /*
   * Insert `num` bytes of value `val` in front of the string `sn`.
   * The current contents are saved in a local buffer, copied back `num`
   * bytes further in, and the gap is filled with `val`.
   * Returns 0.
   * NOTE(review): the local buffer holds NODE_STR_BUF_SIZE bytes and nothing
   * here checks that the string (or the padded result) fits -- presumably
   * callers only pass short strings; confirm against callers.
   */
  static int
  node_str_head_pad(StrNode* sn, int num, UChar val)
  {
    UChar buf[NODE_STR_BUF_SIZE];
    int i, len;

    len = sn->end - sn->s;
    onig_strcpy(buf, sn->s, sn->end);
    onig_strcpy(&(sn->s[num]), buf, buf + len);
    sn->end += num;

    for (i = 0; i < num; i++) {
      sn->s[i] = val;
    }

    /* the function is declared int: give it a defined result */
    return 0;
  }
  #endif
  1345. extern int
  1346. onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
  1347. {
  1348. unsigned int num, val;
  1349. OnigCodePoint c;
  1350. UChar* p = *src;
  1351. PFETCH_READY;
  1352. num = 0;
  1353. while (!PEND) {
  1354. PFETCH(c);
  1355. if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
  1356. val = (unsigned int )DIGITVAL(c);
  1357. if ((INT_MAX_LIMIT - val) / 10UL < num)
  1358. return -1; /* overflow */
  1359. num = num * 10 + val;
  1360. }
  1361. else {
  1362. PUNFETCH;
  1363. break;
  1364. }
  1365. }
  1366. *src = p;
  1367. return num;
  1368. }
  1369. static int
  1370. scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
  1371. OnigEncoding enc)
  1372. {
  1373. OnigCodePoint c;
  1374. unsigned int num, val;
  1375. UChar* p = *src;
  1376. PFETCH_READY;
  1377. num = 0;
  1378. while (!PEND && maxlen-- != 0) {
  1379. PFETCH(c);
  1380. if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
  1381. val = (unsigned int )XDIGITVAL(enc,c);
  1382. if ((INT_MAX_LIMIT - val) / 16UL < num)
  1383. return -1; /* overflow */
  1384. num = (num << 4) + XDIGITVAL(enc,c);
  1385. }
  1386. else {
  1387. PUNFETCH;
  1388. break;
  1389. }
  1390. }
  1391. *src = p;
  1392. return num;
  1393. }
  1394. static int
  1395. scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
  1396. OnigEncoding enc)
  1397. {
  1398. OnigCodePoint c;
  1399. unsigned int num, val;
  1400. UChar* p = *src;
  1401. PFETCH_READY;
  1402. num = 0;
  1403. while (!PEND && maxlen-- != 0) {
  1404. PFETCH(c);
  1405. if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
  1406. val = ODIGITVAL(c);
  1407. if ((INT_MAX_LIMIT - val) / 8UL < num)
  1408. return -1; /* overflow */
  1409. num = (num << 3) + val;
  1410. }
  1411. else {
  1412. PUNFETCH;
  1413. break;
  1414. }
  1415. }
  1416. *src = p;
  1417. return num;
  1418. }
  1419. #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
  1420. BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
  1421. /* data format:
  1422. [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
  1423. (all data size is OnigCodePoint)
  1424. */
  1425. static int
  1426. new_code_range(BBuf** pbuf)
  1427. {
  1428. #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
  1429. int r;
  1430. OnigCodePoint n;
  1431. BBuf* bbuf;
  1432. bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
  1433. CHECK_NULL_RETURN_MEMERR(*pbuf);
  1434. r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
  1435. if (r) return r;
  1436. n = 0;
  1437. BBUF_WRITE_CODE_POINT(bbuf, 0, n);
  1438. return 0;
  1439. }
  1440. static int
  1441. add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
  1442. {
  1443. int r, inc_n, pos;
  1444. int low, high, bound, x;
  1445. OnigCodePoint n, *data;
  1446. BBuf* bbuf;
  1447. if (from > to) {
  1448. n = from; from = to; to = n;
  1449. }
  1450. if (IS_NULL(*pbuf)) {
  1451. r = new_code_range(pbuf);
  1452. if (r) return r;
  1453. bbuf = *pbuf;
  1454. n = 0;
  1455. }
  1456. else {
  1457. bbuf = *pbuf;
  1458. GET_CODE_POINT(n, bbuf->p);
  1459. }
  1460. data = (OnigCodePoint* )(bbuf->p);
  1461. data++;
  1462. for (low = 0, bound = n; low < bound; ) {
  1463. x = (low + bound) >> 1;
  1464. if (from > data[x*2 + 1])
  1465. low = x + 1;
  1466. else
  1467. bound = x;
  1468. }
  1469. for (high = low, bound = n; high < bound; ) {
  1470. x = (high + bound) >> 1;
  1471. if (to >= data[x*2] - 1)
  1472. high = x + 1;
  1473. else
  1474. bound = x;
  1475. }
  1476. inc_n = low + 1 - high;
  1477. if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
  1478. return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
  1479. if (inc_n != 1) {
  1480. if (from > data[low*2])
  1481. from = data[low*2];
  1482. if (to < data[(high - 1)*2 + 1])
  1483. to = data[(high - 1)*2 + 1];
  1484. }
  1485. if (inc_n != 0 && (OnigCodePoint )high < n) {
  1486. int from_pos = SIZE_CODE_POINT * (1 + high * 2);
  1487. int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
  1488. int size = (n - high) * 2 * SIZE_CODE_POINT;
  1489. if (inc_n > 0) {
  1490. BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
  1491. }
  1492. else {
  1493. BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
  1494. }
  1495. }
  1496. pos = SIZE_CODE_POINT * (1 + low * 2);
  1497. BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
  1498. BBUF_WRITE_CODE_POINT(bbuf, pos, from);
  1499. BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
  1500. n += inc_n;
  1501. BBUF_WRITE_CODE_POINT(bbuf, 0, n);
  1502. return 0;
  1503. }
  1504. static int
  1505. add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
  1506. {
  1507. if (from > to) {
  1508. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
  1509. return 0;
  1510. else
  1511. return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
  1512. }
  1513. return add_code_range_to_buf(pbuf, from, to);
  1514. }
  1515. static int
  1516. not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
  1517. {
  1518. int r, i, n;
  1519. OnigCodePoint pre, from, *data, to = 0;
  1520. *pbuf = (BBuf* )NULL;
  1521. if (IS_NULL(bbuf)) {
  1522. set_all:
  1523. return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
  1524. }
  1525. data = (OnigCodePoint* )(bbuf->p);
  1526. GET_CODE_POINT(n, data);
  1527. data++;
  1528. if (n <= 0) goto set_all;
  1529. r = 0;
  1530. pre = MBCODE_START_POS(enc);
  1531. for (i = 0; i < n; i++) {
  1532. from = data[i*2];
  1533. to = data[i*2+1];
  1534. if (pre <= from - 1) {
  1535. r = add_code_range_to_buf(pbuf, pre, from - 1);
  1536. if (r != 0) return r;
  1537. }
  1538. if (to == ~((OnigCodePoint )0)) break;
  1539. pre = to + 1;
  1540. }
  1541. if (to < ~((OnigCodePoint )0)) {
  1542. r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
  1543. }
  1544. return r;
  1545. }
  1546. #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  1547. BBuf *tbuf; \
  1548. int tnot; \
  1549. tnot = not1; not1 = not2; not2 = tnot; \
  1550. tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
  1551. } while (0)
  1552. static int
  1553. or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
  1554. BBuf* bbuf2, int not2, BBuf** pbuf)
  1555. {
  1556. int r;
  1557. OnigCodePoint i, n1, *data1;
  1558. OnigCodePoint from, to;
  1559. *pbuf = (BBuf* )NULL;
  1560. if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
  1561. if (not1 != 0 || not2 != 0)
  1562. return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
  1563. return 0;
  1564. }
  1565. r = 0;
  1566. if (IS_NULL(bbuf2))
  1567. SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
  1568. if (IS_NULL(bbuf1)) {
  1569. if (not1 != 0) {
  1570. return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
  1571. }
  1572. else {
  1573. if (not2 == 0) {
  1574. return bbuf_clone(pbuf, bbuf2);
  1575. }
  1576. else {
  1577. return not_code_range_buf(enc, bbuf2, pbuf);
  1578. }
  1579. }
  1580. }
  1581. if (not1 != 0)
  1582. SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
  1583. data1 = (OnigCodePoint* )(bbuf1->p);
  1584. GET_CODE_POINT(n1, data1);
  1585. data1++;
  1586. if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
  1587. r = bbuf_clone(pbuf, bbuf2);
  1588. }
  1589. else if (not1 == 0) { /* 1 OR (not 2) */
  1590. r = not_code_range_buf(enc, bbuf2, pbuf);
  1591. }
  1592. if (r != 0) return r;
  1593. for (i = 0; i < n1; i++) {
  1594. from = data1[i*2];
  1595. to = data1[i*2+1];
  1596. r = add_code_range_to_buf(pbuf, from, to);
  1597. if (r != 0) return r;
  1598. }
  1599. return 0;
  1600. }
  1601. static int
  1602. and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
  1603. OnigCodePoint* data, int n)
  1604. {
  1605. int i, r;
  1606. OnigCodePoint from2, to2;
  1607. for (i = 0; i < n; i++) {
  1608. from2 = data[i*2];
  1609. to2 = data[i*2+1];
  1610. if (from2 < from1) {
  1611. if (to2 < from1) continue;
  1612. else {
  1613. from1 = to2 + 1;
  1614. }
  1615. }
  1616. else if (from2 <= to1) {
  1617. if (to2 < to1) {
  1618. if (from1 <= from2 - 1) {
  1619. r = add_code_range_to_buf(pbuf, from1, from2-1);
  1620. if (r != 0) return r;
  1621. }
  1622. from1 = to2 + 1;
  1623. }
  1624. else {
  1625. to1 = from2 - 1;
  1626. }
  1627. }
  1628. else {
  1629. from1 = from2;
  1630. }
  1631. if (from1 > to1) break;
  1632. }
  1633. if (from1 <= to1) {
  1634. r = add_code_range_to_buf(pbuf, from1, to1);
  1635. if (r != 0) return r;
  1636. }
  1637. return 0;
  1638. }
  1639. static int
  1640. and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
  1641. {
  1642. int r;
  1643. OnigCodePoint i, j, n1, n2, *data1, *data2;
  1644. OnigCodePoint from, to, from1, to1, from2, to2;
  1645. *pbuf = (BBuf* )NULL;
  1646. if (IS_NULL(bbuf1)) {
  1647. if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
  1648. return bbuf_clone(pbuf, bbuf2);
  1649. return 0;
  1650. }
  1651. else if (IS_NULL(bbuf2)) {
  1652. if (not2 != 0)
  1653. return bbuf_clone(pbuf, bbuf1);
  1654. return 0;
  1655. }
  1656. if (not1 != 0)
  1657. SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
  1658. data1 = (OnigCodePoint* )(bbuf1->p);
  1659. data2 = (OnigCodePoint* )(bbuf2->p);
  1660. GET_CODE_POINT(n1, data1);
  1661. GET_CODE_POINT(n2, data2);
  1662. data1++;
  1663. data2++;
  1664. if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
  1665. for (i = 0; i < n1; i++) {
  1666. from1 = data1[i*2];
  1667. to1 = data1[i*2+1];
  1668. for (j = 0; j < n2; j++) {
  1669. from2 = data2[j*2];
  1670. to2 = data2[j*2+1];
  1671. if (from2 > to1) break;
  1672. if (to2 < from1) continue;
  1673. from = MAX(from1, from2);
  1674. to = MIN(to1, to2);
  1675. r = add_code_range_to_buf(pbuf, from, to);
  1676. if (r != 0) return r;
  1677. }
  1678. }
  1679. }
  1680. else if (not1 == 0) { /* 1 AND (not 2) */
  1681. for (i = 0; i < n1; i++) {
  1682. from1 = data1[i*2];
  1683. to1 = data1[i*2+1];
  1684. r = and_code_range1(pbuf, from1, to1, data2, n2);
  1685. if (r != 0) return r;
  1686. }
  1687. }
  1688. return 0;
  1689. }
  1690. static int
  1691. and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
  1692. {
  1693. int r, not1, not2;
  1694. BBuf *buf1, *buf2, *pbuf;
  1695. BitSetRef bsr1, bsr2;
  1696. BitSet bs1, bs2;
  1697. not1 = IS_NCCLASS_NOT(dest);
  1698. bsr1 = dest->bs;
  1699. buf1 = dest->mbuf;
  1700. not2 = IS_NCCLASS_NOT(cc);
  1701. bsr2 = cc->bs;
  1702. buf2 = cc->mbuf;
  1703. if (not1 != 0) {
  1704. bitset_invert_to(bsr1, bs1);
  1705. bsr1 = bs1;
  1706. }
  1707. if (not2 != 0) {
  1708. bitset_invert_to(bsr2, bs2);
  1709. bsr2 = bs2;
  1710. }
  1711. bitset_and(bsr1, bsr2);
  1712. if (bsr1 != dest->bs) {
  1713. bitset_copy(dest->bs, bsr1);
  1714. bsr1 = dest->bs;
  1715. }
  1716. if (not1 != 0) {
  1717. bitset_invert(dest->bs);
  1718. }
  1719. if (! ONIGENC_IS_SINGLEBYTE(enc)) {
  1720. if (not1 != 0 && not2 != 0) {
  1721. r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
  1722. }
  1723. else {
  1724. r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
  1725. if (r == 0 && not1 != 0) {
  1726. BBuf *tbuf;
  1727. r = not_code_range_buf(enc, pbuf, &tbuf);
  1728. if (r != 0) {
  1729. bbuf_free(pbuf);
  1730. return r;
  1731. }
  1732. bbuf_free(pbuf);
  1733. pbuf = tbuf;
  1734. }
  1735. }
  1736. if (r != 0) return r;
  1737. dest->mbuf = pbuf;
  1738. bbuf_free(buf1);
  1739. return r;
  1740. }
  1741. return 0;
  1742. }
  1743. static int
  1744. or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
  1745. {
  1746. int r, not1, not2;
  1747. BBuf *buf1, *buf2, *pbuf;
  1748. BitSetRef bsr1, bsr2;
  1749. BitSet bs1, bs2;
  1750. not1 = IS_NCCLASS_NOT(dest);
  1751. bsr1 = dest->bs;
  1752. buf1 = dest->mbuf;
  1753. not2 = IS_NCCLASS_NOT(cc);
  1754. bsr2 = cc->bs;
  1755. buf2 = cc->mbuf;
  1756. if (not1 != 0) {
  1757. bitset_invert_to(bsr1, bs1);
  1758. bsr1 = bs1;
  1759. }
  1760. if (not2 != 0) {
  1761. bitset_invert_to(bsr2, bs2);
  1762. bsr2 = bs2;
  1763. }
  1764. bitset_or(bsr1, bsr2);
  1765. if (bsr1 != dest->bs) {
  1766. bitset_copy(dest->bs, bsr1);
  1767. bsr1 = dest->bs;
  1768. }
  1769. if (not1 != 0) {
  1770. bitset_invert(dest->bs);
  1771. }
  1772. if (! ONIGENC_IS_SINGLEBYTE(enc)) {
  1773. if (not1 != 0 && not2 != 0) {
  1774. r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
  1775. }
  1776. else {
  1777. r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
  1778. if (r == 0 && not1 != 0) {
  1779. BBuf *tbuf;
  1780. r = not_code_range_buf(enc, pbuf, &tbuf);
  1781. if (r != 0) {
  1782. bbuf_free(pbuf);
  1783. return r;
  1784. }
  1785. bbuf_free(pbuf);
  1786. pbuf = tbuf;
  1787. }
  1788. }
  1789. if (r != 0) return r;
  1790. dest->mbuf = pbuf;
  1791. bbuf_free(buf1);
  1792. return r;
  1793. }
  1794. else
  1795. return 0;
  1796. }
  1797. static int
  1798. conv_backslash_value(int c, ScanEnv* env)
  1799. {
  1800. if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
  1801. switch (c) {
  1802. case 'n': return '\n';
  1803. case 't': return '\t';
  1804. case 'r': return '\r';
  1805. case 'f': return '\f';
  1806. case 'a': return '\007';
  1807. case 'b': return '\010';
  1808. case 'e': return '\033';
  1809. case 'v':
  1810. if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
  1811. return '\v';
  1812. break;
  1813. default:
  1814. break;
  1815. }
  1816. }
  1817. return c;
  1818. }
  1819. static int
  1820. is_invalid_quantifier_target(Node* node)
  1821. {
  1822. switch (NTYPE(node)) {
  1823. case NT_ANCHOR:
  1824. return 1;
  1825. break;
  1826. case NT_ENCLOSE:
  1827. /* allow enclosed elements */
  1828. /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
  1829. break;
  1830. case NT_LIST:
  1831. do {
  1832. if (! is_invalid_quantifier_target(NCAR(node))) return 0;
  1833. } while (IS_NOT_NULL(node = NCDR(node)));
  1834. return 0;
  1835. break;
  1836. case NT_ALT:
  1837. do {
  1838. if (is_invalid_quantifier_target(NCAR(node))) return 1;
  1839. } while (IS_NOT_NULL(node = NCDR(node)));
  1840. break;
  1841. default:
  1842. break;
  1843. }
  1844. return 0;
  1845. }
  1846. /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
  1847. static int
  1848. popular_quantifier_num(QtfrNode* q)
  1849. {
  1850. if (q->greedy) {
  1851. if (q->lower == 0) {
  1852. if (q->upper == 1) return 0;
  1853. else if (IS_REPEAT_INFINITE(q->upper)) return 1;
  1854. }
  1855. else if (q->lower == 1) {
  1856. if (IS_REPEAT_INFINITE(q->upper)) return 2;
  1857. }
  1858. }
  1859. else {
  1860. if (q->lower == 0) {
  1861. if (q->upper == 1) return 3;
  1862. else if (IS_REPEAT_INFINITE(q->upper)) return 4;
  1863. }
  1864. else if (q->lower == 1) {
  1865. if (IS_REPEAT_INFINITE(q->upper)) return 5;
  1866. }
  1867. }
  1868. return -1;
  1869. }
  /* What to do with a quantifier applied directly to another quantifier. */
  enum ReduceType {
    RQ_ASIS = 0,  /* as is */
    RQ_DEL  = 1,  /* delete parent */
    RQ_A    = 2,  /* to '*' */
    RQ_AQ   = 3,  /* to '*?' */
    RQ_QQ   = 4,  /* to '??' */
    RQ_P_QQ = 5,  /* to '+)??' */
    RQ_PQ_Q = 6   /* to '+?)?' */
  };

  /*
   * Reduction table: the row is the form number of the inner (child)
   * quantifier, the column the form number of the outer (parent) one, in
   * the order  ?  *  +  ??  *?  +?  (see popular_quantifier_num).
   */
  static enum ReduceType ReduceTypeTable[6][6] = {
    /* '?'  */ { RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS },
    /* '*'  */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL  },
    /* '+'  */ { RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL  },
    /* '??' */ { RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ   },
    /* '*?' */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL  },
    /* '+?' */ { RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL  }
  };
  1887. extern void
  1888. onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
  1889. {
  1890. int pnum, cnum;
  1891. QtfrNode *p, *c;
  1892. p = NQTFR(pnode);
  1893. c = NQTFR(cnode);
  1894. pnum = popular_quantifier_num(p);
  1895. cnum = popular_quantifier_num(c);
  1896. if (pnum < 0 || cnum < 0) return ;
  1897. switch(ReduceTypeTable[cnum][pnum]) {
  1898. case RQ_DEL:
  1899. *pnode = *cnode;
  1900. break;
  1901. case RQ_A:
  1902. p->target = c->target;
  1903. p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
  1904. break;
  1905. case RQ_AQ:
  1906. p->target = c->target;
  1907. p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
  1908. break;
  1909. case RQ_QQ:
  1910. p->target = c->target;
  1911. p->lower = 0; p->upper = 1; p->greedy = 0;
  1912. break;
  1913. case RQ_P_QQ:
  1914. p->target = cnode;
  1915. p->lower = 0; p->upper = 1; p->greedy = 0;
  1916. c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
  1917. return ;
  1918. break;
  1919. case RQ_PQ_Q:
  1920. p->target = cnode;
  1921. p->lower = 0; p->upper = 1; p->greedy = 1;
  1922. c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
  1923. return ;
  1924. break;
  1925. case RQ_ASIS:
  1926. p->target = cnode;
  1927. return ;
  1928. break;
  1929. }
  1930. c->target = NULL_NODE;
  1931. onig_node_free(cnode);
  1932. }
  /* Token kinds stored in OnigToken.type. */
  enum TokenSyms {
    TK_EOT      = 0,   /* end of token */
    TK_RAW_BYTE = 1,
    TK_CHAR,
    TK_STRING,
    TK_CODE_POINT,
    TK_ANYCHAR,
    TK_CHAR_TYPE,
    TK_BACKREF,
    TK_CALL,
    TK_ANCHOR,
    TK_OP_REPEAT,
    TK_INTERVAL,
    TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
    TK_ALT,
    TK_SUBEXP_OPEN,
    TK_SUBEXP_CLOSE,
    TK_CC_OPEN,
    TK_QUOTE_OPEN,
    TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */

    /* in cc */
    TK_CC_CLOSE,
    TK_CC_RANGE,
    TK_POSIX_BRACKET_OPEN,
    TK_CC_AND,           /* && */
    TK_CC_CC_OPEN        /* [ */
  };
  1960. typedef struct {
  1961. enum TokenSyms type;
  1962. int escaped;
  1963. int base; /* is number: 8, 16 (used in [....]) */
  1964. UChar* backp;
  1965. union {
  1966. UChar* s;
  1967. int c;
  1968. OnigCodePoint code;
  1969. int anchor;
  1970. int subtype;
  1971. struct {
  1972. int lower;
  1973. int upper;
  1974. int greedy;
  1975. int possessive;
  1976. } repeat;
  1977. struct {
  1978. int num;
  1979. int ref1;
  1980. int* refs;
  1981. int by_name;
  1982. #ifdef USE_BACKREF_WITH_LEVEL
  1983. int exist_level;
  1984. int level; /* \k<name+n> */
  1985. #endif
  1986. } backref;
  1987. struct {
  1988. UChar* name;
  1989. UChar* name_end;
  1990. int gnum;
  1991. } call;
  1992. struct {
  1993. int ctype;
  1994. int not;
  1995. } prop;
  1996. } u;
  1997. } OnigToken;
  1998. static int
  1999. fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
  2000. {
  2001. int low, up, syn_allow, non_low = 0;
  2002. int r = 0;
  2003. OnigCodePoint c;
  2004. OnigEncoding enc = env->enc;
  2005. UChar* p = *src;
  2006. PFETCH_READY;
  2007. syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
  2008. if (PEND) {
  2009. if (syn_allow)
  2010. return 1; /* "....{" : OK! */
  2011. else
  2012. return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
  2013. }
  2014. if (! syn_allow) {
  2015. c = PPEEK;
  2016. if (c == ')' || c == '(' || c == '|') {
  2017. return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
  2018. }
  2019. }
  2020. low = onig_scan_unsigned_number(&p, end, env->enc);
  2021. if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  2022. if (low > ONIG_MAX_REPEAT_NUM)
  2023. return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  2024. if (p == *src) { /* can't read low */
  2025. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
  2026. /* allow {,n} as {0,n} */
  2027. low = 0;
  2028. non_low = 1;
  2029. }
  2030. else
  2031. goto invalid;
  2032. }
  2033. if (PEND) goto invalid;
  2034. PFETCH(c);
  2035. if (c == ',') {
  2036. UChar* prev = p;
  2037. up = onig_scan_unsigned_number(&p, end, env->enc);
  2038. if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  2039. if (up > ONIG_MAX_REPEAT_NUM)
  2040. return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  2041. if (p == prev) {
  2042. if (non_low != 0)
  2043. goto invalid;
  2044. up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
  2045. }
  2046. }
  2047. else {
  2048. if (non_low != 0)
  2049. goto invalid;
  2050. PUNFETCH;
  2051. up = low; /* {n} : exact n times */
  2052. r = 2; /* fixed */
  2053. }
  2054. if (PEND) goto invalid;
  2055. PFETCH(c);
  2056. if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
  2057. if (c != MC_ESC(env->syntax)) goto invalid;
  2058. PFETCH(c);
  2059. }
  2060. if (c != '}') goto invalid;
  2061. if (!IS_REPEAT_INFINITE(up) && low > up) {
  2062. return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
  2063. }
  2064. tok->type = TK_INTERVAL;
  2065. tok->u.repeat.lower = low;
  2066. tok->u.repeat.upper = up;
  2067. *src = p;
  2068. return r; /* 0: normal {n,m}, 2: fixed {n} */
  2069. invalid:
  2070. if (syn_allow)
  2071. return 1; /* OK */
  2072. else
  2073. return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
  2074. }
  2075. /* \M-, \C-, \c, or \... */
  2076. static int
  2077. fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
  2078. {
  2079. int v;
  2080. OnigCodePoint c;
  2081. OnigEncoding enc = env->enc;
  2082. UChar* p = *src;
  2083. if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
  2084. PFETCH_S(c);
  2085. switch (c) {
  2086. case 'M':
  2087. if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
  2088. if (PEND) return ONIGERR_END_PATTERN_AT_META;
  2089. PFETCH_S(c);
  2090. if (c != '-') return ONIGERR_META_CODE_SYNTAX;
  2091. if (PEND) return ONIGERR_END_PATTERN_AT_META;
  2092. PFETCH_S(c);
  2093. if (c == MC_ESC(env->syntax)) {
  2094. v = fetch_escaped_value(&p, end, env);
  2095. if (v < 0) return v;
  2096. c = (OnigCodePoint )v;
  2097. }
  2098. c = ((c & 0xff) | 0x80);
  2099. }
  2100. else
  2101. goto backslash;
  2102. break;
  2103. case 'C':
  2104. if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
  2105. if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
  2106. PFETCH_S(c);
  2107. if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
  2108. goto control;
  2109. }
  2110. else
  2111. goto backslash;
  2112. case 'c':
  2113. if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
  2114. control:
  2115. if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
  2116. PFETCH_S(c);
  2117. if (c == '?') {
  2118. c = 0177;
  2119. }
  2120. else {
  2121. if (c == MC_ESC(env->syntax)) {
  2122. v = fetch_escaped_value(&p, end, env);
  2123. if (v < 0) return v;
  2124. c = (OnigCodePoint )v;
  2125. }
  2126. c &= 0x9f;
  2127. }
  2128. break;
  2129. }
  2130. /* fall through */
  2131. default:
  2132. {
  2133. backslash:
  2134. c = conv_backslash_value(c, env);
  2135. }
  2136. break;
  2137. }
  2138. *src = p;
  2139. return c;
  2140. }
  2141. static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
  2142. static OnigCodePoint
  2143. get_name_end_code_point(OnigCodePoint start)
  2144. {
  2145. switch (start) {
  2146. case '<': return (OnigCodePoint )'>'; break;
  2147. case '\'': return (OnigCodePoint )'\''; break;
  2148. default:
  2149. break;
  2150. }
  2151. return (OnigCodePoint )0;
  2152. }
  2153. #ifdef USE_NAMED_GROUP
  2154. #ifdef USE_BACKREF_WITH_LEVEL
  2155. /*
  2156. \k<name+n>, \k<name-n>
  2157. \k<num+n>, \k<num-n>
  2158. \k<-num+n>, \k<-num-n>
  2159. */
  2160. static int
  2161. fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
  2162. UChar** rname_end, ScanEnv* env,
  2163. int* rback_num, int* rlevel)
  2164. {
  2165. int r, sign, is_num, exist_level;
  2166. OnigCodePoint end_code;
  2167. OnigCodePoint c = 0;
  2168. OnigEncoding enc = env->enc;
  2169. UChar *name_end;
  2170. UChar *pnum_head;
  2171. UChar *p = *src;
  2172. PFETCH_READY;
  2173. *rback_num = 0;
  2174. is_num = exist_level = 0;
  2175. sign = 1;
  2176. pnum_head = *src;
  2177. end_code = get_name_end_code_point(start_code);
  2178. name_end = end;
  2179. r = 0;
  2180. if (PEND) {
  2181. return ONIGERR_EMPTY_GROUP_NAME;
  2182. }
  2183. else {
  2184. PFETCH(c);
  2185. if (c == end_code)
  2186. return ONIGERR_EMPTY_GROUP_NAME;
  2187. if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
  2188. is_num = 1;
  2189. }
  2190. else if (c == '-') {
  2191. is_num = 2;
  2192. sign = -1;
  2193. pnum_head = p;
  2194. }
  2195. else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
  2196. r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  2197. }
  2198. }
  2199. while (!PEND) {
  2200. name_end = p;
  2201. PFETCH(c);
  2202. if (c == end_code || c == ')' || c == '+' || c == '-') {
  2203. if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
  2204. break;
  2205. }
  2206. if (is_num != 0) {
  2207. if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
  2208. is_num = 1;
  2209. }
  2210. else {
  2211. r = ONIGERR_INVALID_GROUP_NAME;
  2212. is_num = 0;
  2213. }
  2214. }
  2215. else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
  2216. r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  2217. }
  2218. }
  2219. if (r == 0 && c != end_code) {
  2220. if (c == '+' || c == '-') {
  2221. int level;
  2222. int flag = (c == '-' ? -1 : 1);
  2223. PFETCH(c);
  2224. if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
  2225. PUNFETCH;
  2226. level = onig_scan_unsigned_number(&p, end, enc);
  2227. if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
  2228. *rlevel = (level * flag);
  2229. exist_level = 1;
  2230. PFETCH(c);
  2231. if (c == end_code)
  2232. goto end;
  2233. }
  2234. err:
  2235. r = ONIGERR_INVALID_GROUP_NAME;
  2236. name_end = end;
  2237. }
  2238. end:
  2239. if (r == 0) {
  2240. if (is_num != 0) {
  2241. *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
  2242. if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
  2243. else if (*rback_num == 0) goto err;
  2244. *rback_num *= sign;
  2245. }
  2246. *rname_end = name_end;
  2247. *src = p;
  2248. return (exist_level ? 1 : 0);
  2249. }
  2250. else {
  2251. onig_scan_env_set_error_string(env, r, *src, name_end);
  2252. return r;
  2253. }
  2254. }
  2255. #endif /* USE_BACKREF_WITH_LEVEL */
  2256. /*
  2257. def: 0 -> define name (don't allow number name)
  2258. 1 -> reference name (allow number name)
  2259. */
  2260. static int
  2261. fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
  2262. UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
  2263. {
  2264. int r, is_num, sign;
  2265. OnigCodePoint end_code;
  2266. OnigCodePoint c = 0;
  2267. OnigEncoding enc = env->enc;
  2268. UChar *name_end;
  2269. UChar *pnum_head;
  2270. UChar *p = *src;
  2271. *rback_num = 0;
  2272. end_code = get_name_end_code_point(start_code);
  2273. name_end = end;
  2274. pnum_head = *src;
  2275. r = 0;
  2276. is_num = 0;
  2277. sign = 1;
  2278. if (PEND) {
  2279. return ONIGERR_EMPTY_GROUP_NAME;
  2280. }
  2281. else {
  2282. PFETCH_S(c);
  2283. if (c == end_code)
  2284. return ONIGERR_EMPTY_GROUP_NAME;
  2285. if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
  2286. if (ref == 1)
  2287. is_num = 1;
  2288. else {
  2289. r = ONIGERR_INVALID_GROUP_NAME;
  2290. is_num = 0;
  2291. }
  2292. }
  2293. else if (c == '-') {
  2294. if (ref == 1) {
  2295. is_num = 2;
  2296. sign = -1;
  2297. pnum_head = p;
  2298. }
  2299. else {
  2300. r = ONIGERR_INVALID_GROUP_NAME;
  2301. is_num = 0;
  2302. }
  2303. }
  2304. else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
  2305. r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  2306. }
  2307. }
  2308. if (r == 0) {
  2309. while (!PEND) {
  2310. name_end = p;
  2311. PFETCH_S(c);
  2312. if (c == end_code || c == ')') {
  2313. if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
  2314. break;
  2315. }
  2316. if (is_num != 0) {
  2317. if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
  2318. is_num = 1;
  2319. }
  2320. else {
  2321. if (!ONIGENC_IS_CODE_WORD(enc, c))
  2322. r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  2323. else
  2324. r = ONIGERR_INVALID_GROUP_NAME;
  2325. is_num = 0;
  2326. }
  2327. }
  2328. else {
  2329. if (!ONIGENC_IS_CODE_WORD(enc, c)) {
  2330. r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  2331. }
  2332. }
  2333. }
  2334. if (c != end_code) {
  2335. r = ONIGERR_INVALID_GROUP_NAME;
  2336. name_end = end;
  2337. }
  2338. if (is_num != 0) {
  2339. *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
  2340. if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
  2341. else if (*rback_num == 0) {
  2342. r = ONIGERR_INVALID_GROUP_NAME;
  2343. goto err;
  2344. }
  2345. *rback_num *= sign;
  2346. }
  2347. *rname_end = name_end;
  2348. *src = p;
  2349. return 0;
  2350. }
  2351. else {
  2352. while (!PEND) {
  2353. name_end = p;
  2354. PFETCH_S(c);
  2355. if (c == end_code || c == ')')
  2356. break;
  2357. }
  2358. if (PEND)
  2359. name_end = end;
  2360. err:
  2361. onig_scan_env_set_error_string(env, r, *src, name_end);
  2362. return r;
  2363. }
  2364. }
  2365. #else
  2366. static int
  2367. fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
  2368. UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
  2369. {
  2370. int r, is_num, sign;
  2371. OnigCodePoint end_code;
  2372. OnigCodePoint c = 0;
  2373. UChar *name_end;
  2374. OnigEncoding enc = env->enc;
  2375. UChar *pnum_head;
  2376. UChar *p = *src;
  2377. PFETCH_READY;
  2378. *rback_num = 0;
  2379. end_code = get_name_end_code_point(start_code);
  2380. *rname_end = name_end = end;
  2381. r = 0;
  2382. pnum_head = *src;
  2383. is_num = 0;
  2384. sign = 1;
  2385. if (PEND) {
  2386. return ONIGERR_EMPTY_GROUP_NAME;
  2387. }
  2388. else {
  2389. PFETCH(c);
  2390. if (c == end_code)
  2391. return ONIGERR_EMPTY_GROUP_NAME;
  2392. if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
  2393. is_num = 1;
  2394. }
  2395. else if (c == '-') {
  2396. is_num = 2;
  2397. sign = -1;
  2398. pnum_head = p;
  2399. }
  2400. else {
  2401. r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  2402. }
  2403. }
  2404. while (!PEND) {
  2405. name_end = p;
  2406. PFETCH(c);
  2407. if (c == end_code || c == ')') break;
  2408. if (! ONIGENC_IS_CODE_DIGIT(enc, c))
  2409. r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  2410. }
  2411. if (r == 0 && c != end_code) {
  2412. r = ONIGERR_INVALID_GROUP_NAME;
  2413. name_end = end;
  2414. }
  2415. if (r == 0) {
  2416. *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
  2417. if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
  2418. else if (*rback_num == 0) {
  2419. r = ONIGERR_INVALID_GROUP_NAME;
  2420. goto err;
  2421. }
  2422. *rback_num *= sign;
  2423. *rname_end = name_end;
  2424. *src = p;
  2425. return 0;
  2426. }
  2427. else {
  2428. err:
  2429. onig_scan_env_set_error_string(env, r, *src, name_end);
  2430. return r;
  2431. }
  2432. }
  2433. #endif /* USE_NAMED_GROUP */
  2434. static void
  2435. CC_ESC_WARN(ScanEnv* env, UChar *c)
  2436. {
  2437. if (onig_warn == onig_null_warn) return ;
  2438. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
  2439. IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
  2440. UChar buf[WARN_BUFSIZE];
  2441. onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
  2442. env->pattern, env->pattern_end,
  2443. (UChar* )"character class has '%s' without escape", c);
  2444. (*onig_warn)((char* )buf);
  2445. }
  2446. }
  2447. static void
  2448. CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
  2449. {
  2450. if (onig_warn == onig_null_warn) return ;
  2451. if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
  2452. UChar buf[WARN_BUFSIZE];
  2453. onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
  2454. (env)->pattern, (env)->pattern_end,
  2455. (UChar* )"regular expression has '%s' without escape", c);
  2456. (*onig_warn)((char* )buf);
  2457. }
  2458. }
  2459. static UChar*
  2460. find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
  2461. UChar **next, OnigEncoding enc)
  2462. {
  2463. int i;
  2464. OnigCodePoint x;
  2465. UChar *q;
  2466. UChar *p = from;
  2467. while (p < to) {
  2468. x = ONIGENC_MBC_TO_CODE(enc, p, to);
  2469. q = p + enclen(enc, p);
  2470. if (x == s[0]) {
  2471. for (i = 1; i < n && q < to; i++) {
  2472. x = ONIGENC_MBC_TO_CODE(enc, q, to);
  2473. if (x != s[i]) break;
  2474. q += enclen(enc, q);
  2475. }
  2476. if (i >= n) {
  2477. if (IS_NOT_NULL(next))
  2478. *next = q;
  2479. return p;
  2480. }
  2481. }
  2482. p = q;
  2483. }
  2484. return NULL_UCHARP;
  2485. }
  2486. static int
  2487. str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
  2488. OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
  2489. {
  2490. int i, in_esc;
  2491. OnigCodePoint x;
  2492. UChar *q;
  2493. UChar *p = from;
  2494. in_esc = 0;
  2495. while (p < to) {
  2496. if (in_esc) {
  2497. in_esc = 0;
  2498. p += enclen(enc, p);
  2499. }
  2500. else {
  2501. x = ONIGENC_MBC_TO_CODE(enc, p, to);
  2502. q = p + enclen(enc, p);
  2503. if (x == s[0]) {
  2504. for (i = 1; i < n && q < to; i++) {
  2505. x = ONIGENC_MBC_TO_CODE(enc, q, to);
  2506. if (x != s[i]) break;
  2507. q += enclen(enc, q);
  2508. }
  2509. if (i >= n) return 1;
  2510. p += enclen(enc, p);
  2511. }
  2512. else {
  2513. x = ONIGENC_MBC_TO_CODE(enc, p, to);
  2514. if (x == bad) return 0;
  2515. else if (x == MC_ESC(syn)) in_esc = 1;
  2516. p = q;
  2517. }
  2518. }
  2519. }
  2520. return 0;
  2521. }
  2522. static int
  2523. fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
  2524. {
  2525. int num;
  2526. OnigCodePoint c, c2;
  2527. OnigSyntaxType* syn = env->syntax;
  2528. OnigEncoding enc = env->enc;
  2529. UChar* prev;
  2530. UChar* p = *src;
  2531. PFETCH_READY;
  2532. if (PEND) {
  2533. tok->type = TK_EOT;
  2534. return tok->type;
  2535. }
  2536. PFETCH(c);
  2537. tok->type = TK_CHAR;
  2538. tok->base = 0;
  2539. tok->u.c = c;
  2540. tok->escaped = 0;
  2541. if (c == ']') {
  2542. tok->type = TK_CC_CLOSE;
  2543. }
  2544. else if (c == '-') {
  2545. tok->type = TK_CC_RANGE;
  2546. }
  2547. else if (c == MC_ESC(syn)) {
  2548. if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
  2549. goto end;
  2550. if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
  2551. PFETCH(c);
  2552. tok->escaped = 1;
  2553. tok->u.c = c;
  2554. switch (c) {
  2555. case 'w':
  2556. tok->type = TK_CHAR_TYPE;
  2557. tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
  2558. tok->u.prop.not = 0;
  2559. break;
  2560. case 'W':
  2561. tok->type = TK_CHAR_TYPE;
  2562. tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
  2563. tok->u.prop.not = 1;
  2564. break;
  2565. case 'd':
  2566. tok->type = TK_CHAR_TYPE;
  2567. tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
  2568. tok->u.prop.not = 0;
  2569. break;
  2570. case 'D':
  2571. tok->type = TK_CHAR_TYPE;
  2572. tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
  2573. tok->u.prop.not = 1;
  2574. break;
  2575. case 's':
  2576. tok->type = TK_CHAR_TYPE;
  2577. tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
  2578. tok->u.prop.not = 0;
  2579. break;
  2580. case 'S':
  2581. tok->type = TK_CHAR_TYPE;
  2582. tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
  2583. tok->u.prop.not = 1;
  2584. break;
  2585. case 'h':
  2586. if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
  2587. tok->type = TK_CHAR_TYPE;
  2588. tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
  2589. tok->u.prop.not = 0;
  2590. break;
  2591. case 'H':
  2592. if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
  2593. tok->type = TK_CHAR_TYPE;
  2594. tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
  2595. tok->u.prop.not = 1;
  2596. break;
  2597. case 'p':
  2598. case 'P':
  2599. c2 = PPEEK;
  2600. if (c2 == '{' &&
  2601. IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
  2602. PINC;
  2603. tok->type = TK_CHAR_PROPERTY;
  2604. tok->u.prop.not = (c == 'P' ? 1 : 0);
  2605. if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
  2606. PFETCH(c2);
  2607. if (c2 == '^') {
  2608. tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
  2609. }
  2610. else
  2611. PUNFETCH;
  2612. }
  2613. }
  2614. break;
  2615. case 'x':
  2616. if (PEND) break;
  2617. prev = p;
  2618. if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
  2619. PINC;
  2620. num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
  2621. if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
  2622. if (!PEND) {
  2623. c2 = PPEEK;
  2624. if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
  2625. return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
  2626. }
  2627. if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
  2628. PINC;
  2629. tok->type = TK_CODE_POINT;
  2630. tok->base = 16;
  2631. tok->u.code = (OnigCodePoint )num;
  2632. }
  2633. else {
  2634. /* can't read nothing or invalid format */
  2635. p = prev;
  2636. }
  2637. }
  2638. else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
  2639. num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
  2640. if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
  2641. if (p == prev) { /* can't read nothing. */
  2642. num = 0; /* but, it's not error */
  2643. }
  2644. tok->type = TK_RAW_BYTE;
  2645. tok->base = 16;
  2646. tok->u.c = num;
  2647. }
  2648. break;
  2649. case 'u':
  2650. if (PEND) break;
  2651. prev = p;
  2652. if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
  2653. num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
  2654. if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
  2655. if (p == prev) { /* can't read nothing. */
  2656. num = 0; /* but, it's not error */
  2657. }
  2658. tok->type = TK_CODE_POINT;
  2659. tok->base = 16;
  2660. tok->u.code = (OnigCodePoint )num;
  2661. }
  2662. break;
  2663. case '0':
  2664. case '1': case '2': case '3': case '4': case '5': case '6': case '7':
  2665. if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
  2666. PUNFETCH;
  2667. prev = p;
  2668. num = scan_unsigned_octal_number(&p, end, 3, enc);
  2669. if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
  2670. if (p == prev) { /* can't read nothing. */
  2671. num = 0; /* but, it's not error */
  2672. }
  2673. tok->type = TK_RAW_BYTE;
  2674. tok->base = 8;
  2675. tok->u.c = num;
  2676. }
  2677. break;
  2678. default:
  2679. PUNFETCH;
  2680. num = fetch_escaped_value(&p, end, env);
  2681. if (num < 0) return num;
  2682. if (tok->u.c != num) {
  2683. tok->u.code = (OnigCodePoint )num;
  2684. tok->type = TK_CODE_POINT;
  2685. }
  2686. break;
  2687. }
  2688. }
  2689. else if (c == '[') {
  2690. if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
  2691. OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
  2692. tok->backp = p; /* point at '[' is readed */
  2693. PINC;
  2694. if (str_exist_check_with_esc(send, 2, p, end,
  2695. (OnigCodePoint )']', enc, syn)) {
  2696. tok->type = TK_POSIX_BRACKET_OPEN;
  2697. }
  2698. else {
  2699. PUNFETCH;
  2700. goto cc_in_cc;
  2701. }
  2702. }
  2703. else {
  2704. cc_in_cc:
  2705. if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
  2706. tok->type = TK_CC_CC_OPEN;
  2707. }
  2708. else {
  2709. CC_ESC_WARN(env, (UChar* )"[");
  2710. }
  2711. }
  2712. }
  2713. else if (c == '&') {
  2714. if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
  2715. !PEND && (PPEEK_IS('&'))) {
  2716. PINC;
  2717. tok->type = TK_CC_AND;
  2718. }
  2719. }
  2720. end:
  2721. *src = p;
  2722. return tok->type;
  2723. }
  2724. static int
  2725. fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
  2726. {
  2727. int r, num;
  2728. OnigCodePoint c;
  2729. OnigEncoding enc = env->enc;
  2730. OnigSyntaxType* syn = env->syntax;
  2731. UChar* prev;
  2732. UChar* p = *src;
  2733. PFETCH_READY;
  2734. start:
  2735. if (PEND) {
  2736. tok->type = TK_EOT;
  2737. return tok->type;
  2738. }
  2739. tok->type = TK_STRING;
  2740. tok->base = 0;
  2741. tok->backp = p;
  2742. PFETCH(c);
  2743. if (IS_MC_ESC_CODE(c, syn)) {
  2744. if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
  2745. tok->backp = p;
  2746. PFETCH(c);
  2747. tok->u.c = c;
  2748. tok->escaped = 1;
  2749. switch (c) {
  2750. case '*':
  2751. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
  2752. tok->type = TK_OP_REPEAT;
  2753. tok->u.repeat.lower = 0;
  2754. tok->u.repeat.upper = REPEAT_INFINITE;
  2755. goto greedy_check;
  2756. break;
  2757. case '+':
  2758. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
  2759. tok->type = TK_OP_REPEAT;
  2760. tok->u.repeat.lower = 1;
  2761. tok->u.repeat.upper = REPEAT_INFINITE;
  2762. goto greedy_check;
  2763. break;
  2764. case '?':
  2765. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
  2766. tok->type = TK_OP_REPEAT;
  2767. tok->u.repeat.lower = 0;
  2768. tok->u.repeat.upper = 1;
  2769. greedy_check:
  2770. if (!PEND && PPEEK_IS('?') &&
  2771. IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
  2772. PFETCH(c);
  2773. tok->u.repeat.greedy = 0;
  2774. tok->u.repeat.possessive = 0;
  2775. }
  2776. else {
  2777. possessive_check:
  2778. if (!PEND && PPEEK_IS('+') &&
  2779. ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
  2780. tok->type != TK_INTERVAL) ||
  2781. (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
  2782. tok->type == TK_INTERVAL))) {
  2783. PFETCH(c);
  2784. tok->u.repeat.greedy = 1;
  2785. tok->u.repeat.possessive = 1;
  2786. }
  2787. else {
  2788. tok->u.repeat.greedy = 1;
  2789. tok->u.repeat.possessive = 0;
  2790. }
  2791. }
  2792. break;
  2793. case '{':
  2794. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
  2795. r = fetch_range_quantifier(&p, end, tok, env);
  2796. if (r < 0) return r; /* error */
  2797. if (r == 0) goto greedy_check;
  2798. else if (r == 2) { /* {n} */
  2799. if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
  2800. goto possessive_check;
  2801. goto greedy_check;
  2802. }
  2803. /* r == 1 : normal char */
  2804. break;
  2805. case '|':
  2806. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
  2807. tok->type = TK_ALT;
  2808. break;
  2809. case '(':
  2810. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
  2811. tok->type = TK_SUBEXP_OPEN;
  2812. break;
  2813. case ')':
  2814. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
  2815. tok->type = TK_SUBEXP_CLOSE;
  2816. break;
  2817. case 'w':
  2818. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
  2819. tok->type = TK_CHAR_TYPE;
  2820. tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
  2821. tok->u.prop.not = 0;
  2822. break;
  2823. case 'W':
  2824. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
  2825. tok->type = TK_CHAR_TYPE;
  2826. tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
  2827. tok->u.prop.not = 1;
  2828. break;
  2829. case 'b':
  2830. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
  2831. tok->type = TK_ANCHOR;
  2832. tok->u.anchor = ANCHOR_WORD_BOUND;
  2833. break;
  2834. case 'B':
  2835. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
  2836. tok->type = TK_ANCHOR;
  2837. tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
  2838. break;
  2839. #ifdef USE_WORD_BEGIN_END
  2840. case '<':
  2841. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
  2842. tok->type = TK_ANCHOR;
  2843. tok->u.anchor = ANCHOR_WORD_BEGIN;
  2844. break;
  2845. case '>':
  2846. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
  2847. tok->type = TK_ANCHOR;
  2848. tok->u.anchor = ANCHOR_WORD_END;
  2849. break;
  2850. #endif
  2851. case 's':
  2852. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
  2853. tok->type = TK_CHAR_TYPE;
  2854. tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
  2855. tok->u.prop.not = 0;
  2856. break;
  2857. case 'S':
  2858. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
  2859. tok->type = TK_CHAR_TYPE;
  2860. tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
  2861. tok->u.prop.not = 1;
  2862. break;
  2863. case 'd':
  2864. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
  2865. tok->type = TK_CHAR_TYPE;
  2866. tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
  2867. tok->u.prop.not = 0;
  2868. break;
  2869. case 'D':
  2870. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
  2871. tok->type = TK_CHAR_TYPE;
  2872. tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
  2873. tok->u.prop.not = 1;
  2874. break;
  2875. case 'h':
  2876. if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
  2877. tok->type = TK_CHAR_TYPE;
  2878. tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
  2879. tok->u.prop.not = 0;
  2880. break;
  2881. case 'H':
  2882. if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
  2883. tok->type = TK_CHAR_TYPE;
  2884. tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
  2885. tok->u.prop.not = 1;
  2886. break;
  2887. case 'A':
  2888. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
  2889. begin_buf:
  2890. tok->type = TK_ANCHOR;
  2891. tok->u.subtype = ANCHOR_BEGIN_BUF;
  2892. break;
  2893. case 'Z':
  2894. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
  2895. tok->type = TK_ANCHOR;
  2896. tok->u.subtype = ANCHOR_SEMI_END_BUF;
  2897. break;
  2898. case 'z':
  2899. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
  2900. end_buf:
  2901. tok->type = TK_ANCHOR;
  2902. tok->u.subtype = ANCHOR_END_BUF;
  2903. break;
  2904. case 'G':
  2905. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
  2906. tok->type = TK_ANCHOR;
  2907. tok->u.subtype = ANCHOR_BEGIN_POSITION;
  2908. break;
  2909. case '`':
  2910. if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
  2911. goto begin_buf;
  2912. break;
  2913. case '\'':
  2914. if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
  2915. goto end_buf;
  2916. break;
  2917. case 'x':
  2918. if (PEND) break;
  2919. prev = p;
  2920. if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
  2921. PINC;
  2922. num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
  2923. if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
  2924. if (!PEND) {
  2925. if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
  2926. return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
  2927. }
  2928. if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
  2929. PINC;
  2930. tok->type = TK_CODE_POINT;
  2931. tok->u.code = (OnigCodePoint )num;
  2932. }
  2933. else {
  2934. /* can't read nothing or invalid format */
  2935. p = prev;
  2936. }
  2937. }
  2938. else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
  2939. num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
  2940. if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
  2941. if (p == prev) { /* can't read nothing. */
  2942. num = 0; /* but, it's not error */
  2943. }
  2944. tok->type = TK_RAW_BYTE;
  2945. tok->base = 16;
  2946. tok->u.c = num;
  2947. }
  2948. break;
  2949. case 'u':
  2950. if (PEND) break;
  2951. prev = p;
  2952. if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
  2953. num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
  2954. if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
  2955. if (p == prev) { /* can't read nothing. */
  2956. num = 0; /* but, it's not error */
  2957. }
  2958. tok->type = TK_CODE_POINT;
  2959. tok->base = 16;
  2960. tok->u.code = (OnigCodePoint )num;
  2961. }
  2962. break;
  2963. case '1': case '2': case '3': case '4':
  2964. case '5': case '6': case '7': case '8': case '9':
  2965. PUNFETCH;
  2966. prev = p;
  2967. num = onig_scan_unsigned_number(&p, end, enc);
  2968. if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
  2969. goto skip_backref;
  2970. }
  2971. if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
  2972. (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
  2973. if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
  2974. if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
  2975. return ONIGERR_INVALID_BACKREF;
  2976. }
  2977. tok->type = TK_BACKREF;
  2978. tok->u.backref.num = 1;
  2979. tok->u.backref.ref1 = num;
  2980. tok->u.backref.by_name = 0;
  2981. #ifdef USE_BACKREF_WITH_LEVEL
  2982. tok->u.backref.exist_level = 0;
  2983. #endif
  2984. break;
  2985. }
  2986. skip_backref:
  2987. if (c == '8' || c == '9') {
  2988. /* normal char */
  2989. p = prev; PINC;
  2990. break;
  2991. }
  2992. p = prev;
  2993. /* fall through */
  2994. case '0':
  2995. if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
  2996. prev = p;
  2997. num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
  2998. if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
  2999. if (p == prev) { /* can't read nothing. */
  3000. num = 0; /* but, it's not error */
  3001. }
  3002. tok->type = TK_RAW_BYTE;
  3003. tok->base = 8;
  3004. tok->u.c = num;
  3005. }
  3006. else if (c != '0') {
  3007. PINC;
  3008. }
  3009. break;
  3010. #ifdef USE_NAMED_GROUP
  3011. case 'k':
  3012. if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
  3013. PFETCH(c);
  3014. if (c == '<' || c == '\'') {
  3015. UChar* name_end;
  3016. int* backs;
  3017. int back_num;
  3018. prev = p;
  3019. #ifdef USE_BACKREF_WITH_LEVEL
  3020. name_end = NULL_UCHARP; /* no need. escape gcc warning. */
  3021. r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
  3022. env, &back_num, &tok->u.backref.level);
  3023. if (r == 1) tok->u.backref.exist_level = 1;
  3024. else tok->u.backref.exist_level = 0;
  3025. #else
  3026. r = fetch_name(&p, end, &name_end, env, &back_num, 1);
  3027. #endif
  3028. if (r < 0) return r;
  3029. if (back_num != 0) {
  3030. if (back_num < 0) {
  3031. back_num = BACKREF_REL_TO_ABS(back_num, env);
  3032. if (back_num <= 0)
  3033. return ONIGERR_INVALID_BACKREF;
  3034. }
  3035. if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
  3036. if (back_num > env->num_mem ||
  3037. IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
  3038. return ONIGERR_INVALID_BACKREF;
  3039. }
  3040. tok->type = TK_BACKREF;
  3041. tok->u.backref.by_name = 0;
  3042. tok->u.backref.num = 1;
  3043. tok->u.backref.ref1 = back_num;
  3044. }
  3045. else {
  3046. num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
  3047. if (num <= 0) {
  3048. onig_scan_env_set_error_string(env,
  3049. ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
  3050. return ONIGERR_UNDEFINED_NAME_REFERENCE;
  3051. }
  3052. if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
  3053. int i;
  3054. for (i = 0; i < num; i++) {
  3055. if (backs[i] > env->num_mem ||
  3056. IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
  3057. return ONIGERR_INVALID_BACKREF;
  3058. }
  3059. }
  3060. tok->type = TK_BACKREF;
  3061. tok->u.backref.by_name = 1;
  3062. if (num == 1) {
  3063. tok->u.backref.num = 1;
  3064. tok->u.backref.ref1 = backs[0];
  3065. }
  3066. else {
  3067. tok->u.backref.num = num;
  3068. tok->u.backref.refs = backs;
  3069. }
  3070. }
  3071. }
  3072. else
  3073. PUNFETCH;
  3074. }
  3075. break;
  3076. #endif
  3077. #ifdef USE_SUBEXP_CALL
  3078. case 'g':
  3079. if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
  3080. PFETCH(c);
  3081. if (c == '<' || c == '\'') {
  3082. int gnum;
  3083. UChar* name_end;
  3084. prev = p;
  3085. r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
  3086. if (r < 0) return r;
  3087. tok->type = TK_CALL;
  3088. tok->u.call.name = prev;
  3089. tok->u.call.name_end = name_end;
  3090. tok->u.call.gnum = gnum;
  3091. }
  3092. else
  3093. PUNFETCH;
  3094. }
  3095. break;
  3096. #endif
  3097. case 'Q':
  3098. if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
  3099. tok->type = TK_QUOTE_OPEN;
  3100. }
  3101. break;
  3102. case 'p':
  3103. case 'P':
  3104. if (PPEEK_IS('{') &&
  3105. IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
  3106. PINC;
  3107. tok->type = TK_CHAR_PROPERTY;
  3108. tok->u.prop.not = (c == 'P' ? 1 : 0);
  3109. if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
  3110. PFETCH(c);
  3111. if (c == '^') {
  3112. tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
  3113. }
  3114. else
  3115. PUNFETCH;
  3116. }
  3117. }
  3118. break;
  3119. default:
  3120. PUNFETCH;
  3121. num = fetch_escaped_value(&p, end, env);
  3122. if (num < 0) return num;
  3123. /* set_raw: */
  3124. if (tok->u.c != num) {
  3125. tok->type = TK_CODE_POINT;
  3126. tok->u.code = (OnigCodePoint )num;
  3127. }
  3128. else { /* string */
  3129. int len;
  3130. SAFE_ENC_LEN(enc, tok->backp, end, len);
  3131. p = tok->backp + len;
  3132. }
  3133. break;
  3134. }
  3135. }
  3136. else {
  3137. tok->u.c = c;
  3138. tok->escaped = 0;
  3139. #ifdef USE_VARIABLE_META_CHARS
  3140. if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
  3141. IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
  3142. if (c == MC_ANYCHAR(syn))
  3143. goto any_char;
  3144. else if (c == MC_ANYTIME(syn))
  3145. goto anytime;
  3146. else if (c == MC_ZERO_OR_ONE_TIME(syn))
  3147. goto zero_or_one_time;
  3148. else if (c == MC_ONE_OR_MORE_TIME(syn))
  3149. goto one_or_more_time;
  3150. else if (c == MC_ANYCHAR_ANYTIME(syn)) {
  3151. tok->type = TK_ANYCHAR_ANYTIME;
  3152. goto out;
  3153. }
  3154. }
  3155. #endif
  3156. switch (c) {
  3157. case '.':
  3158. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
  3159. #ifdef USE_VARIABLE_META_CHARS
  3160. any_char:
  3161. #endif
  3162. tok->type = TK_ANYCHAR;
  3163. break;
  3164. case '*':
  3165. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
  3166. #ifdef USE_VARIABLE_META_CHARS
  3167. anytime:
  3168. #endif
  3169. tok->type = TK_OP_REPEAT;
  3170. tok->u.repeat.lower = 0;
  3171. tok->u.repeat.upper = REPEAT_INFINITE;
  3172. goto greedy_check;
  3173. break;
  3174. case '+':
  3175. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
  3176. #ifdef USE_VARIABLE_META_CHARS
  3177. one_or_more_time:
  3178. #endif
  3179. tok->type = TK_OP_REPEAT;
  3180. tok->u.repeat.lower = 1;
  3181. tok->u.repeat.upper = REPEAT_INFINITE;
  3182. goto greedy_check;
  3183. break;
  3184. case '?':
  3185. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
  3186. #ifdef USE_VARIABLE_META_CHARS
  3187. zero_or_one_time:
  3188. #endif
  3189. tok->type = TK_OP_REPEAT;
  3190. tok->u.repeat.lower = 0;
  3191. tok->u.repeat.upper = 1;
  3192. goto greedy_check;
  3193. break;
  3194. case '{':
  3195. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
  3196. r = fetch_range_quantifier(&p, end, tok, env);
  3197. if (r < 0) return r; /* error */
  3198. if (r == 0) goto greedy_check;
  3199. else if (r == 2) { /* {n} */
  3200. if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
  3201. goto possessive_check;
  3202. goto greedy_check;
  3203. }
  3204. /* r == 1 : normal char */
  3205. break;
  3206. case '|':
  3207. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
  3208. tok->type = TK_ALT;
  3209. break;
  3210. case '(':
  3211. if (PPEEK_IS('?') &&
  3212. IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
  3213. PINC;
  3214. if (PPEEK_IS('#')) {
  3215. PFETCH(c);
  3216. while (1) {
  3217. if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
  3218. PFETCH(c);
  3219. if (c == MC_ESC(syn)) {
  3220. if (!PEND) PFETCH(c);
  3221. }
  3222. else {
  3223. if (c == ')') break;
  3224. }
  3225. }
  3226. goto start;
  3227. }
  3228. PUNFETCH;
  3229. }
  3230. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
  3231. tok->type = TK_SUBEXP_OPEN;
  3232. break;
  3233. case ')':
  3234. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
  3235. tok->type = TK_SUBEXP_CLOSE;
  3236. break;
  3237. case '^':
  3238. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
  3239. tok->type = TK_ANCHOR;
  3240. tok->u.subtype = (IS_SINGLELINE(env->option)
  3241. ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
  3242. break;
  3243. case '$':
  3244. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
  3245. tok->type = TK_ANCHOR;
  3246. tok->u.subtype = (IS_SINGLELINE(env->option)
  3247. ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
  3248. break;
  3249. case '[':
  3250. if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
  3251. tok->type = TK_CC_OPEN;
  3252. break;
  3253. case ']':
  3254. if (*src > env->pattern) /* /].../ is allowed. */
  3255. CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
  3256. break;
  3257. case '#':
  3258. if (IS_EXTEND(env->option)) {
  3259. while (!PEND) {
  3260. PFETCH(c);
  3261. if (ONIGENC_IS_CODE_NEWLINE(enc, c))
  3262. break;
  3263. }
  3264. goto start;
  3265. break;
  3266. }
  3267. break;
  3268. case ' ': case '\t': case '\n': case '\r': case '\f':
  3269. if (IS_EXTEND(env->option))
  3270. goto start;
  3271. break;
  3272. default:
  3273. /* string */
  3274. break;
  3275. }
  3276. }
  3277. #ifdef USE_VARIABLE_META_CHARS
  3278. out:
  3279. #endif
  3280. *src = p;
  3281. return tok->type;
  3282. }
  3283. static int
  3284. add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
  3285. OnigEncoding enc ARG_UNUSED,
  3286. OnigCodePoint sb_out, const OnigCodePoint mbr[])
  3287. {
  3288. int i, r;
  3289. OnigCodePoint j;
  3290. int n = ONIGENC_CODE_RANGE_NUM(mbr);
  3291. if (not == 0) {
  3292. for (i = 0; i < n; i++) {
  3293. for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
  3294. j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
  3295. if (j >= sb_out) {
  3296. if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
  3297. else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
  3298. r = add_code_range_to_buf(&(cc->mbuf), j,
  3299. ONIGENC_CODE_RANGE_TO(mbr, i));
  3300. if (r != 0) return r;
  3301. i++;
  3302. }
  3303. goto sb_end;
  3304. }
  3305. BITSET_SET_BIT(cc->bs, j);
  3306. }
  3307. }
  3308. sb_end:
  3309. for ( ; i < n; i++) {
  3310. r = add_code_range_to_buf(&(cc->mbuf),
  3311. ONIGENC_CODE_RANGE_FROM(mbr, i),
  3312. ONIGENC_CODE_RANGE_TO(mbr, i));
  3313. if (r != 0) return r;
  3314. }
  3315. }
  3316. else {
  3317. OnigCodePoint prev = 0;
  3318. for (i = 0; i < n; i++) {
  3319. for (j = prev;
  3320. j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
  3321. if (j >= sb_out) {
  3322. goto sb_end2;
  3323. }
  3324. BITSET_SET_BIT(cc->bs, j);
  3325. }
  3326. prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
  3327. }
  3328. for (j = prev; j < sb_out; j++) {
  3329. BITSET_SET_BIT(cc->bs, j);
  3330. }
  3331. sb_end2:
  3332. prev = sb_out;
  3333. for (i = 0; i < n; i++) {
  3334. if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
  3335. r = add_code_range_to_buf(&(cc->mbuf), prev,
  3336. ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
  3337. if (r != 0) return r;
  3338. }
  3339. prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
  3340. }
  3341. if (prev < 0x7fffffff) {
  3342. r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
  3343. if (r != 0) return r;
  3344. }
  3345. }
  3346. return 0;
  3347. }
  3348. static int
  3349. add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
  3350. {
  3351. int c, r;
  3352. const OnigCodePoint *ranges;
  3353. OnigCodePoint sb_out;
  3354. OnigEncoding enc = env->enc;
  3355. r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
  3356. if (r == 0) {
  3357. return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
  3358. }
  3359. else if (r != ONIG_NO_SUPPORT_CONFIG) {
  3360. return r;
  3361. }
  3362. r = 0;
  3363. switch (ctype) {
  3364. case ONIGENC_CTYPE_ALPHA:
  3365. case ONIGENC_CTYPE_BLANK:
  3366. case ONIGENC_CTYPE_CNTRL:
  3367. case ONIGENC_CTYPE_DIGIT:
  3368. case ONIGENC_CTYPE_LOWER:
  3369. case ONIGENC_CTYPE_PUNCT:
  3370. case ONIGENC_CTYPE_SPACE:
  3371. case ONIGENC_CTYPE_UPPER:
  3372. case ONIGENC_CTYPE_XDIGIT:
  3373. case ONIGENC_CTYPE_ASCII:
  3374. case ONIGENC_CTYPE_ALNUM:
  3375. if (not != 0) {
  3376. for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
  3377. if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
  3378. BITSET_SET_BIT(cc->bs, c);
  3379. }
  3380. ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
  3381. }
  3382. else {
  3383. for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
  3384. if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
  3385. BITSET_SET_BIT(cc->bs, c);
  3386. }
  3387. }
  3388. break;
  3389. case ONIGENC_CTYPE_GRAPH:
  3390. case ONIGENC_CTYPE_PRINT:
  3391. if (not != 0) {
  3392. for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
  3393. if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
  3394. BITSET_SET_BIT(cc->bs, c);
  3395. }
  3396. }
  3397. else {
  3398. for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
  3399. if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
  3400. BITSET_SET_BIT(cc->bs, c);
  3401. }
  3402. ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
  3403. }
  3404. break;
  3405. case ONIGENC_CTYPE_WORD:
  3406. if (not == 0) {
  3407. for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
  3408. if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
  3409. }
  3410. ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
  3411. }
  3412. else {
  3413. for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
  3414. if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
  3415. && ! ONIGENC_IS_CODE_WORD(enc, c))
  3416. BITSET_SET_BIT(cc->bs, c);
  3417. }
  3418. }
  3419. break;
  3420. default:
  3421. return ONIGERR_PARSER_BUG;
  3422. break;
  3423. }
  3424. return r;
  3425. }
  3426. static int
  3427. parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
  3428. {
  3429. #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
  3430. #define POSIX_BRACKET_NAME_MIN_LEN 4
  3431. static PosixBracketEntryType PBS[] = {
  3432. { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
  3433. { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
  3434. { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
  3435. { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
  3436. { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
  3437. { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
  3438. { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
  3439. { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
  3440. { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
  3441. { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
  3442. { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
  3443. { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
  3444. { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
  3445. { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
  3446. { (UChar* )NULL, -1, 0 }
  3447. };
  3448. PosixBracketEntryType *pb;
  3449. int not, i, r;
  3450. OnigCodePoint c;
  3451. OnigEncoding enc = env->enc;
  3452. UChar *p = *src;
  3453. if (PPEEK_IS('^')) {
  3454. PINC_S;
  3455. not = 1;
  3456. }
  3457. else
  3458. not = 0;
  3459. if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
  3460. goto not_posix_bracket;
  3461. for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
  3462. if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
  3463. p = (UChar* )onigenc_step(enc, p, end, pb->len);
  3464. if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
  3465. return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
  3466. r = add_ctype_to_cc(cc, pb->ctype, not, env);
  3467. if (r != 0) return r;
  3468. PINC_S; PINC_S;
  3469. *src = p;
  3470. return 0;
  3471. }
  3472. }
  3473. not_posix_bracket:
  3474. c = 0;
  3475. i = 0;
  3476. while (!PEND && ((c = PPEEK) != ':') && c != ']') {
  3477. PINC_S;
  3478. if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
  3479. }
  3480. if (c == ':' && ! PEND) {
  3481. PINC_S;
  3482. if (! PEND) {
  3483. PFETCH_S(c);
  3484. if (c == ']')
  3485. return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
  3486. }
  3487. }
  3488. return 1; /* 1: is not POSIX bracket, but no error. */
  3489. }
  3490. static int
  3491. fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
  3492. {
  3493. int r;
  3494. OnigCodePoint c;
  3495. OnigEncoding enc = env->enc;
  3496. UChar *prev, *start, *p = *src;
  3497. r = 0;
  3498. start = prev = p;
  3499. while (!PEND) {
  3500. prev = p;
  3501. PFETCH_S(c);
  3502. if (c == '}') {
  3503. r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
  3504. if (r < 0) break;
  3505. *src = p;
  3506. return r;
  3507. }
  3508. else if (c == '(' || c == ')' || c == '{' || c == '|') {
  3509. r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
  3510. break;
  3511. }
  3512. }
  3513. onig_scan_env_set_error_string(env, r, *src, prev);
  3514. return r;
  3515. }
  3516. static int
  3517. parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
  3518. ScanEnv* env)
  3519. {
  3520. int r, ctype;
  3521. CClassNode* cc;
  3522. ctype = fetch_char_property_to_ctype(src, end, env);
  3523. if (ctype < 0) return ctype;
  3524. *np = node_new_cclass();
  3525. CHECK_NULL_RETURN_MEMERR(*np);
  3526. cc = NCCLASS(*np);
  3527. r = add_ctype_to_cc(cc, ctype, 0, env);
  3528. if (r != 0) return r;
  3529. if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
  3530. return 0;
  3531. }
  /* Parser state while reading the items of a character class. */
  enum CCSTATE {
    CCS_VALUE    = 0,  /* an item was read; a plain value is still pending */
    CCS_RANGE    = 1,  /* a range operator was accepted; end value expected */
    CCS_COMPLETE = 2,  /* a range has just been completed */
    CCS_START    = 3   /* at the start of a class or right after "&&" */
  };
  /* Kind of the most recently read character class item. */
  enum CCVALTYPE {
    CCV_SB         = 0,  /* value recorded in the class bitset */
    CCV_CODE_POINT = 1,  /* value recorded in the class code range buffer */
    CCV_CLASS      = 2   /* a whole class/type item rather than one value */
  };
  3543. static int
  3544. next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
  3545. enum CCSTATE* state, ScanEnv* env)
  3546. {
  3547. int r;
  3548. if (*state == CCS_RANGE)
  3549. return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
  3550. if (*state == CCS_VALUE && *type != CCV_CLASS) {
  3551. if (*type == CCV_SB)
  3552. BITSET_SET_BIT(cc->bs, (int )(*vs));
  3553. else if (*type == CCV_CODE_POINT) {
  3554. r = add_code_range(&(cc->mbuf), env, *vs, *vs);
  3555. if (r < 0) return r;
  3556. }
  3557. }
  3558. if (*state != CCS_START)
  3559. *state = CCS_VALUE;
  3560. *type = CCV_CLASS;
  3561. return 0;
  3562. }
  3563. static int
  3564. next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
  3565. int* vs_israw, int v_israw,
  3566. enum CCVALTYPE intype, enum CCVALTYPE* type,
  3567. enum CCSTATE* state, ScanEnv* env)
  3568. {
  3569. int r;
  3570. switch (*state) {
  3571. case CCS_VALUE:
  3572. if (*type == CCV_SB)
  3573. {
  3574. if (*vs > 0xff)
  3575. return ONIGERR_INVALID_CODE_POINT_VALUE;
  3576. BITSET_SET_BIT(cc->bs, (int )(*vs));
  3577. }
  3578. else if (*type == CCV_CODE_POINT) {
  3579. r = add_code_range(&(cc->mbuf), env, *vs, *vs);
  3580. if (r < 0) return r;
  3581. }
  3582. break;
  3583. case CCS_RANGE:
  3584. if (intype == *type) {
  3585. if (intype == CCV_SB) {
  3586. if (*vs > 0xff || v > 0xff)
  3587. return ONIGERR_INVALID_CODE_POINT_VALUE;
  3588. if (*vs > v) {
  3589. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
  3590. goto ccs_range_end;
  3591. else
  3592. return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
  3593. }
  3594. bitset_set_range(cc->bs, (int )*vs, (int )v);
  3595. }
  3596. else {
  3597. r = add_code_range(&(cc->mbuf), env, *vs, v);
  3598. if (r < 0) return r;
  3599. }
  3600. }
  3601. else {
  3602. #if 0
  3603. if (intype == CCV_CODE_POINT && *type == CCV_SB) {
  3604. #endif
  3605. if (*vs > v) {
  3606. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
  3607. goto ccs_range_end;
  3608. else
  3609. return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
  3610. }
  3611. bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
  3612. r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
  3613. if (r < 0) return r;
  3614. #if 0
  3615. }
  3616. else
  3617. return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
  3618. #endif
  3619. }
  3620. ccs_range_end:
  3621. *state = CCS_COMPLETE;
  3622. break;
  3623. case CCS_COMPLETE:
  3624. case CCS_START:
  3625. *state = CCS_VALUE;
  3626. break;
  3627. default:
  3628. break;
  3629. }
  3630. *vs_israw = v_israw;
  3631. *vs = v;
  3632. *type = intype;
  3633. return 0;
  3634. }
  3635. static int
  3636. code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
  3637. ScanEnv* env)
  3638. {
  3639. int in_esc;
  3640. OnigCodePoint code;
  3641. OnigEncoding enc = env->enc;
  3642. UChar* p = from;
  3643. in_esc = 0;
  3644. while (! PEND) {
  3645. if (ignore_escaped && in_esc) {
  3646. in_esc = 0;
  3647. }
  3648. else {
  3649. PFETCH_S(code);
  3650. if (code == c) return 1;
  3651. if (code == MC_ESC(env->syntax)) in_esc = 1;
  3652. }
  3653. }
  3654. return 0;
  3655. }
  3656. static int
  3657. parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
  3658. ScanEnv* env)
  3659. {
  3660. int r, neg, len, fetched, and_start;
  3661. OnigCodePoint v, vs;
  3662. UChar *p;
  3663. Node* node;
  3664. CClassNode *cc, *prev_cc;
  3665. CClassNode work_cc;
  3666. enum CCSTATE state;
  3667. enum CCVALTYPE val_type, in_type;
  3668. int val_israw, in_israw;
  3669. prev_cc = (CClassNode* )NULL;
  3670. *np = NULL_NODE;
  3671. r = fetch_token_in_cc(tok, src, end, env);
  3672. if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
  3673. neg = 1;
  3674. r = fetch_token_in_cc(tok, src, end, env);
  3675. }
  3676. else {
  3677. neg = 0;
  3678. }
  3679. if (r < 0) return r;
  3680. if (r == TK_CC_CLOSE) {
  3681. if (! code_exist_check((OnigCodePoint )']',
  3682. *src, env->pattern_end, 1, env))
  3683. return ONIGERR_EMPTY_CHAR_CLASS;
  3684. CC_ESC_WARN(env, (UChar* )"]");
  3685. r = tok->type = TK_CHAR; /* allow []...] */
  3686. }
  3687. *np = node = node_new_cclass();
  3688. CHECK_NULL_RETURN_MEMERR(node);
  3689. cc = NCCLASS(node);
  3690. and_start = 0;
  3691. state = CCS_START;
  3692. p = *src;
  3693. while (r != TK_CC_CLOSE) {
  3694. fetched = 0;
  3695. switch (r) {
  3696. case TK_CHAR:
  3697. len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
  3698. if (len > 1) {
  3699. in_type = CCV_CODE_POINT;
  3700. }
  3701. else if (len < 0) {
  3702. r = len;
  3703. goto err;
  3704. }
  3705. else {
  3706. sb_char:
  3707. in_type = CCV_SB;
  3708. }
  3709. v = (OnigCodePoint )tok->u.c;
  3710. in_israw = 0;
  3711. goto val_entry2;
  3712. break;
  3713. case TK_RAW_BYTE:
  3714. /* tok->base != 0 : octal or hexadec. */
  3715. if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
  3716. UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
  3717. UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
  3718. UChar* psave = p;
  3719. int i, base = tok->base;
  3720. buf[0] = tok->u.c;
  3721. for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
  3722. r = fetch_token_in_cc(tok, &p, end, env);
  3723. if (r < 0) goto err;
  3724. if (r != TK_RAW_BYTE || tok->base != base) {
  3725. fetched = 1;
  3726. break;
  3727. }
  3728. buf[i] = tok->u.c;
  3729. }
  3730. if (i < ONIGENC_MBC_MINLEN(env->enc)) {
  3731. r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
  3732. goto err;
  3733. }
  3734. len = enclen(env->enc, buf);
  3735. if (i < len) {
  3736. r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
  3737. goto err;
  3738. }
  3739. else if (i > len) { /* fetch back */
  3740. p = psave;
  3741. for (i = 1; i < len; i++) {
  3742. r = fetch_token_in_cc(tok, &p, end, env);
  3743. }
  3744. fetched = 0;
  3745. }
  3746. if (i == 1) {
  3747. v = (OnigCodePoint )buf[0];
  3748. goto raw_single;
  3749. }
  3750. else {
  3751. v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
  3752. in_type = CCV_CODE_POINT;
  3753. }
  3754. }
  3755. else {
  3756. v = (OnigCodePoint )tok->u.c;
  3757. raw_single:
  3758. in_type = CCV_SB;
  3759. }
  3760. in_israw = 1;
  3761. goto val_entry2;
  3762. break;
  3763. case TK_CODE_POINT:
  3764. v = tok->u.code;
  3765. in_israw = 1;
  3766. val_entry:
  3767. len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
  3768. if (len < 0) {
  3769. r = len;
  3770. goto err;
  3771. }
  3772. in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
  3773. val_entry2:
  3774. r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
  3775. &state, env);
  3776. if (r != 0) goto err;
  3777. break;
  3778. case TK_POSIX_BRACKET_OPEN:
  3779. r = parse_posix_bracket(cc, &p, end, env);
  3780. if (r < 0) goto err;
  3781. if (r == 1) { /* is not POSIX bracket */
  3782. CC_ESC_WARN(env, (UChar* )"[");
  3783. p = tok->backp;
  3784. v = (OnigCodePoint )tok->u.c;
  3785. in_israw = 0;
  3786. goto val_entry;
  3787. }
  3788. goto next_class;
  3789. break;
  3790. case TK_CHAR_TYPE:
  3791. r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
  3792. if (r != 0) return r;
  3793. next_class:
  3794. r = next_state_class(cc, &vs, &val_type, &state, env);
  3795. if (r != 0) goto err;
  3796. break;
  3797. case TK_CHAR_PROPERTY:
  3798. {
  3799. int ctype;
  3800. ctype = fetch_char_property_to_ctype(&p, end, env);
  3801. if (ctype < 0) return ctype;
  3802. r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
  3803. if (r != 0) return r;
  3804. goto next_class;
  3805. }
  3806. break;
  3807. case TK_CC_RANGE:
  3808. if (state == CCS_VALUE) {
  3809. r = fetch_token_in_cc(tok, &p, end, env);
  3810. if (r < 0) goto err;
  3811. fetched = 1;
  3812. if (r == TK_CC_CLOSE) { /* allow [x-] */
  3813. range_end_val:
  3814. v = (OnigCodePoint )'-';
  3815. in_israw = 0;
  3816. goto val_entry;
  3817. }
  3818. else if (r == TK_CC_AND) {
  3819. CC_ESC_WARN(env, (UChar* )"-");
  3820. goto range_end_val;
  3821. }
  3822. state = CCS_RANGE;
  3823. }
  3824. else if (state == CCS_START) {
  3825. /* [-xa] is allowed */
  3826. v = (OnigCodePoint )tok->u.c;
  3827. in_israw = 0;
  3828. r = fetch_token_in_cc(tok, &p, end, env);
  3829. if (r < 0) goto err;
  3830. fetched = 1;
  3831. /* [--x] or [a&&-x] is warned. */
  3832. if (r == TK_CC_RANGE || and_start != 0)
  3833. CC_ESC_WARN(env, (UChar* )"-");
  3834. goto val_entry;
  3835. }
  3836. else if (state == CCS_RANGE) {
  3837. CC_ESC_WARN(env, (UChar* )"-");
  3838. goto sb_char; /* [!--x] is allowed */
  3839. }
  3840. else { /* CCS_COMPLETE */
  3841. r = fetch_token_in_cc(tok, &p, end, env);
  3842. if (r < 0) goto err;
  3843. fetched = 1;
  3844. if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
  3845. else if (r == TK_CC_AND) {
  3846. CC_ESC_WARN(env, (UChar* )"-");
  3847. goto range_end_val;
  3848. }
  3849. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
  3850. CC_ESC_WARN(env, (UChar* )"-");
  3851. goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
  3852. }
  3853. r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
  3854. goto err;
  3855. }
  3856. break;
  3857. case TK_CC_CC_OPEN: /* [ */
  3858. {
  3859. Node *anode;
  3860. CClassNode* acc;
  3861. r = parse_char_class(&anode, tok, &p, end, env);
  3862. if (r != 0) goto cc_open_err;
  3863. acc = NCCLASS(anode);
  3864. r = or_cclass(cc, acc, env->enc);
  3865. onig_node_free(anode);
  3866. cc_open_err:
  3867. if (r != 0) goto err;
  3868. }
  3869. break;
  3870. case TK_CC_AND: /* && */
  3871. {
  3872. if (state == CCS_VALUE) {
  3873. r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
  3874. &val_type, &state, env);
  3875. if (r != 0) goto err;
  3876. }
  3877. /* initialize local variables */
  3878. and_start = 1;
  3879. state = CCS_START;
  3880. if (IS_NOT_NULL(prev_cc)) {
  3881. r = and_cclass(prev_cc, cc, env->enc);
  3882. if (r != 0) goto err;
  3883. bbuf_free(cc->mbuf);
  3884. }
  3885. else {
  3886. prev_cc = cc;
  3887. cc = &work_cc;
  3888. }
  3889. initialize_cclass(cc);
  3890. }
  3891. break;
  3892. case TK_EOT:
  3893. r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
  3894. goto err;
  3895. break;
  3896. default:
  3897. r = ONIGERR_PARSER_BUG;
  3898. goto err;
  3899. break;
  3900. }
  3901. if (fetched)
  3902. r = tok->type;
  3903. else {
  3904. r = fetch_token_in_cc(tok, &p, end, env);
  3905. if (r < 0) goto err;
  3906. }
  3907. }
  3908. if (state == CCS_VALUE) {
  3909. r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
  3910. &val_type, &state, env);
  3911. if (r != 0) goto err;
  3912. }
  3913. if (IS_NOT_NULL(prev_cc)) {
  3914. r = and_cclass(prev_cc, cc, env->enc);
  3915. if (r != 0) goto err;
  3916. bbuf_free(cc->mbuf);
  3917. cc = prev_cc;
  3918. }
  3919. if (neg != 0)
  3920. NCCLASS_SET_NOT(cc);
  3921. else
  3922. NCCLASS_CLEAR_NOT(cc);
  3923. if (IS_NCCLASS_NOT(cc) &&
  3924. IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
  3925. int is_empty;
  3926. is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
  3927. if (is_empty != 0)
  3928. BITSET_IS_EMPTY(cc->bs, is_empty);
  3929. if (is_empty == 0) {
  3930. #define NEWLINE_CODE 0x0a
  3931. if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
  3932. if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
  3933. BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
  3934. else
  3935. add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
  3936. }
  3937. }
  3938. }
  3939. *src = p;
  3940. return 0;
  3941. err:
  3942. if (cc != NCCLASS(*np))
  3943. bbuf_free(cc->mbuf);
  3944. onig_node_free(*np);
  3945. return r;
  3946. }
  3947. static int parse_subexp(Node** top, OnigToken* tok, int term,
  3948. UChar** src, UChar* end, ScanEnv* env);
  3949. static int
  3950. parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
  3951. ScanEnv* env)
  3952. {
  3953. int r, num;
  3954. Node *target;
  3955. OnigOptionType option;
  3956. OnigCodePoint c;
  3957. OnigEncoding enc = env->enc;
  3958. #ifdef USE_NAMED_GROUP
  3959. int list_capture;
  3960. #endif
  3961. UChar* p = *src;
  3962. PFETCH_READY;
  3963. *np = NULL;
  3964. if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
  3965. option = env->option;
  3966. if (PPEEK_IS('?') &&
  3967. IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
  3968. PINC;
  3969. if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
  3970. PFETCH(c);
  3971. switch (c) {
  3972. case ':': /* (?:...) grouping only */
  3973. group:
  3974. r = fetch_token(tok, &p, end, env);
  3975. if (r < 0) return r;
  3976. r = parse_subexp(np, tok, term, &p, end, env);
  3977. if (r < 0) return r;
  3978. *src = p;
  3979. return 1; /* group */
  3980. break;
  3981. case '=':
  3982. *np = onig_node_new_anchor(ANCHOR_PREC_READ);
  3983. break;
  3984. case '!': /* preceding read */
  3985. *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
  3986. break;
  3987. case '>': /* (?>...) stop backtrack */
  3988. *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
  3989. break;
  3990. #ifdef USE_NAMED_GROUP
  3991. case '\'':
  3992. if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
  3993. goto named_group1;
  3994. }
  3995. else
  3996. return ONIGERR_UNDEFINED_GROUP_OPTION;
  3997. break;
  3998. #endif
  3999. case '<': /* look behind (?<=...), (?<!...) */
  4000. PFETCH(c);
  4001. if (c == '=')
  4002. *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
  4003. else if (c == '!')
  4004. *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
  4005. #ifdef USE_NAMED_GROUP
  4006. else {
  4007. if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
  4008. UChar *name;
  4009. UChar *name_end;
  4010. PUNFETCH;
  4011. c = '<';
  4012. named_group1:
  4013. list_capture = 0;
  4014. named_group2:
  4015. name = p;
  4016. r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
  4017. if (r < 0) return r;
  4018. num = scan_env_add_mem_entry(env);
  4019. if (num < 0) return num;
  4020. if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
  4021. return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
  4022. r = name_add(env->reg, name, name_end, num, env);
  4023. if (r != 0) return r;
  4024. *np = node_new_enclose_memory(env->option, 1);
  4025. CHECK_NULL_RETURN_MEMERR(*np);
  4026. NENCLOSE(*np)->regnum = num;
  4027. if (list_capture != 0)
  4028. BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
  4029. env->num_named++;
  4030. }
  4031. else {
  4032. return ONIGERR_UNDEFINED_GROUP_OPTION;
  4033. }
  4034. }
  4035. #else
  4036. else {
  4037. return ONIGERR_UNDEFINED_GROUP_OPTION;
  4038. }
  4039. #endif
  4040. break;
  4041. case '@':
  4042. if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
  4043. #ifdef USE_NAMED_GROUP
  4044. if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
  4045. PFETCH(c);
  4046. if (c == '<' || c == '\'') {
  4047. list_capture = 1;
  4048. goto named_group2; /* (?@<name>...) */
  4049. }
  4050. PUNFETCH;
  4051. }
  4052. #endif
  4053. *np = node_new_enclose_memory(env->option, 0);
  4054. CHECK_NULL_RETURN_MEMERR(*np);
  4055. num = scan_env_add_mem_entry(env);
  4056. if (num < 0) {
  4057. onig_node_free(*np);
  4058. return num;
  4059. }
  4060. else if (num >= (int )BIT_STATUS_BITS_NUM) {
  4061. onig_node_free(*np);
  4062. return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
  4063. }
  4064. NENCLOSE(*np)->regnum = num;
  4065. BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
  4066. }
  4067. else {
  4068. return ONIGERR_UNDEFINED_GROUP_OPTION;
  4069. }
  4070. break;
  4071. #ifdef USE_POSIXLINE_OPTION
  4072. case 'p':
  4073. #endif
  4074. case '-': case 'i': case 'm': case 's': case 'x':
  4075. {
  4076. int neg = 0;
  4077. while (1) {
  4078. switch (c) {
  4079. case ':':
  4080. case ')':
  4081. break;
  4082. case '-': neg = 1; break;
  4083. case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
  4084. case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
  4085. case 's':
  4086. if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
  4087. ONOFF(option, ONIG_OPTION_MULTILINE, neg);
  4088. }
  4089. else
  4090. return ONIGERR_UNDEFINED_GROUP_OPTION;
  4091. break;
  4092. case 'm':
  4093. if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
  4094. ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
  4095. }
  4096. else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
  4097. ONOFF(option, ONIG_OPTION_MULTILINE, neg);
  4098. }
  4099. else
  4100. return ONIGERR_UNDEFINED_GROUP_OPTION;
  4101. break;
  4102. #ifdef USE_POSIXLINE_OPTION
  4103. case 'p':
  4104. ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
  4105. break;
  4106. #endif
  4107. default:
  4108. return ONIGERR_UNDEFINED_GROUP_OPTION;
  4109. }
  4110. if (c == ')') {
  4111. *np = node_new_option(option);
  4112. CHECK_NULL_RETURN_MEMERR(*np);
  4113. *src = p;
  4114. return 2; /* option only */
  4115. }
  4116. else if (c == ':') {
  4117. OnigOptionType prev = env->option;
  4118. env->option = option;
  4119. r = fetch_token(tok, &p, end, env);
  4120. if (r < 0) return r;
  4121. r = parse_subexp(&target, tok, term, &p, end, env);
  4122. env->option = prev;
  4123. if (r < 0) return r;
  4124. *np = node_new_option(option);
  4125. CHECK_NULL_RETURN_MEMERR(*np);
  4126. NENCLOSE(*np)->target = target;
  4127. *src = p;
  4128. return 0;
  4129. }
  4130. if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
  4131. PFETCH(c);
  4132. }
  4133. }
  4134. break;
  4135. default:
  4136. return ONIGERR_UNDEFINED_GROUP_OPTION;
  4137. }
  4138. }
  4139. else {
  4140. if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
  4141. goto group;
  4142. *np = node_new_enclose_memory(env->option, 0);
  4143. CHECK_NULL_RETURN_MEMERR(*np);
  4144. num = scan_env_add_mem_entry(env);
  4145. if (num < 0) return num;
  4146. NENCLOSE(*np)->regnum = num;
  4147. }
  4148. CHECK_NULL_RETURN_MEMERR(*np);
  4149. r = fetch_token(tok, &p, end, env);
  4150. if (r < 0) return r;
  4151. r = parse_subexp(&target, tok, term, &p, end, env);
  4152. if (r < 0) return r;
  4153. if (NTYPE(*np) == NT_ANCHOR)
  4154. NANCHOR(*np)->target = target;
  4155. else {
  4156. NENCLOSE(*np)->target = target;
  4157. if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
  4158. /* Don't move this to previous of parse_subexp() */
  4159. r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
  4160. if (r != 0) return r;
  4161. }
  4162. }
  4163. *src = p;
  4164. return 0;
  4165. }
  4166. static const char* PopularQStr[] = {
  4167. "?", "*", "+", "??", "*?", "+?"
  4168. };
  4169. static const char* ReduceQStr[] = {
  4170. "", "", "*", "*?", "??", "+ and ??", "+? and ?"
  4171. };
  4172. static int
  4173. set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
  4174. {
  4175. QtfrNode* qn;
  4176. qn = NQTFR(qnode);
  4177. if (qn->lower == 1 && qn->upper == 1) {
  4178. return 1;
  4179. }
  4180. switch (NTYPE(target)) {
  4181. case NT_STR:
  4182. if (! group) {
  4183. StrNode* sn = NSTR(target);
  4184. if (str_node_can_be_split(sn, env->enc)) {
  4185. Node* n = str_node_split_last_char(sn, env->enc);
  4186. if (IS_NOT_NULL(n)) {
  4187. qn->target = n;
  4188. return 2;
  4189. }
  4190. }
  4191. }
  4192. break;
  4193. case NT_QTFR:
  4194. { /* check redundant double repeat. */
  4195. /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
  4196. QtfrNode* qnt = NQTFR(target);
  4197. int nestq_num = popular_quantifier_num(qn);
  4198. int targetq_num = popular_quantifier_num(qnt);
  4199. #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
  4200. if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
  4201. IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
  4202. UChar buf[WARN_BUFSIZE];
  4203. switch(ReduceTypeTable[targetq_num][nestq_num]) {
  4204. case RQ_ASIS:
  4205. break;
  4206. case RQ_DEL:
  4207. if (onig_verb_warn != onig_null_warn) {
  4208. onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
  4209. env->pattern, env->pattern_end,
  4210. (UChar* )"redundant nested repeat operator");
  4211. (*onig_verb_warn)((char* )buf);
  4212. }
  4213. goto warn_exit;
  4214. break;
  4215. default:
  4216. if (onig_verb_warn != onig_null_warn) {
  4217. onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
  4218. env->pattern, env->pattern_end,
  4219. (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
  4220. PopularQStr[targetq_num], PopularQStr[nestq_num],
  4221. ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
  4222. (*onig_verb_warn)((char* )buf);
  4223. }
  4224. goto warn_exit;
  4225. break;
  4226. }
  4227. }
  4228. warn_exit:
  4229. #endif
  4230. if (targetq_num >= 0) {
  4231. if (nestq_num >= 0) {
  4232. onig_reduce_nested_quantifier(qnode, target);
  4233. goto q_exit;
  4234. }
  4235. else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
  4236. /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
  4237. if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
  4238. qn->upper = (qn->lower == 0 ? 1 : qn->lower);
  4239. }
  4240. }
  4241. }
  4242. }
  4243. break;
  4244. default:
  4245. break;
  4246. }
  4247. qn->target = target;
  4248. q_exit:
  4249. return 0;
  4250. }
  4251. #ifdef USE_SHARED_CCLASS_TABLE
  4252. #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
  4253. /* for ctype node hash table */
  4254. typedef struct {
  4255. OnigEncoding enc;
  4256. int not;
  4257. int type;
  4258. } type_cclass_key;
  4259. static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
  4260. {
  4261. if (x->type != y->type) return 1;
  4262. if (x->enc != y->enc) return 1;
  4263. if (x->not != y->not) return 1;
  4264. return 0;
  4265. }
  4266. static int type_cclass_hash(type_cclass_key* key)
  4267. {
  4268. int i, val;
  4269. UChar *p;
  4270. val = 0;
  4271. p = (UChar* )&(key->enc);
  4272. for (i = 0; i < (int )sizeof(key->enc); i++) {
  4273. val = val * 997 + (int )*p++;
  4274. }
  4275. p = (UChar* )(&key->type);
  4276. for (i = 0; i < (int )sizeof(key->type); i++) {
  4277. val = val * 997 + (int )*p++;
  4278. }
  4279. val += key->not;
  4280. return val + (val >> 5);
  4281. }
  /* st hash-type descriptor for tables keyed by type_cclass_key:
     comparison callback first, hash callback second. */
  4282. static struct st_hash_type type_type_cclass_hash = {
  4283. type_cclass_cmp,
  4284. type_cclass_hash,
  4285. };
  /* File-scope table of shared ctype char-class nodes.  It is NULL until
     parse_exp creates it on first use, and it is reset to NULL by
     onig_free_shared_cclass_table(). */
  4286. static st_table* OnigTypeCClassTable;
  4287. static int
  4288. i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
  4289. {
  4290. if (IS_NOT_NULL(node)) {
  4291. CClassNode* cc = NCCLASS(node);
  4292. if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
  4293. xfree(node);
  4294. }
  4295. if (IS_NOT_NULL(key)) xfree(key);
  4296. return ST_DELETE;
  4297. }
  4298. extern int
  4299. onig_free_shared_cclass_table(void)
  4300. {
  4301. if (IS_NOT_NULL(OnigTypeCClassTable)) {
  4302. onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
  4303. onig_st_free_table(OnigTypeCClassTable);
  4304. OnigTypeCClassTable = NULL;
  4305. }
  4306. return 0;
  4307. }
  4308. #endif /* USE_SHARED_CCLASS_TABLE */
  4309. #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
  4310. static int
  4311. clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
  4312. {
  4313. BBuf *tbuf;
  4314. int r;
  4315. if (IS_NCCLASS_NOT(cc)) {
  4316. bitset_invert(cc->bs);
  4317. if (! ONIGENC_IS_SINGLEBYTE(enc)) {
  4318. r = not_code_range_buf(enc, cc->mbuf, &tbuf);
  4319. if (r != 0) return r;
  4320. bbuf_free(cc->mbuf);
  4321. cc->mbuf = tbuf;
  4322. }
  4323. NCCLASS_CLEAR_NOT(cc);
  4324. }
  4325. return 0;
  4326. }
  4327. #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
  /* Argument bundle handed to i_apply_case_fold through the void* arg of
     ONIGENC_APPLY_ALL_CASE_FOLD. */
  4328. typedef struct {
  4329. ScanEnv* env; /* scan environment; supplies the encoding (env->enc) */
  4330. CClassNode* cc; /* char class that receives the single-char folds */
  4331. Node* alt_root; /* head of the alternation of multi-char fold strings; NULL_NODE while empty */
  4332. Node** ptail; /* slot where the next alternation node is linked */
  4333. } IApplyCaseFoldArg;
  4334. static int
  4335. i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
  4336. int to_len, void* arg)
  4337. {
  4338. IApplyCaseFoldArg* iarg;
  4339. ScanEnv* env;
  4340. CClassNode* cc;
  4341. BitSetRef bs;
  4342. iarg = (IApplyCaseFoldArg* )arg;
  4343. env = iarg->env;
  4344. cc = iarg->cc;
  4345. bs = cc->bs;
  4346. if (to_len == 1) {
  4347. int is_in = onig_is_code_in_cc(env->enc, from, cc);
  4348. #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
  4349. if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
  4350. (is_in == 0 && IS_NCCLASS_NOT(cc))) {
  4351. if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
  4352. add_code_range(&(cc->mbuf), env, *to, *to);
  4353. }
  4354. else {
  4355. BITSET_SET_BIT(bs, *to);
  4356. }
  4357. }
  4358. #else
  4359. if (is_in != 0) {
  4360. if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
  4361. if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
  4362. add_code_range(&(cc->mbuf), env, *to, *to);
  4363. }
  4364. else {
  4365. if (IS_NCCLASS_NOT(cc)) {
  4366. BITSET_CLEAR_BIT(bs, *to);
  4367. }
  4368. else
  4369. BITSET_SET_BIT(bs, *to);
  4370. }
  4371. }
  4372. #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
  4373. }
  4374. else {
  4375. int r, i, len;
  4376. UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
  4377. Node *snode = NULL_NODE;
  4378. if (onig_is_code_in_cc(env->enc, from, cc)
  4379. #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
  4380. && !IS_NCCLASS_NOT(cc)
  4381. #endif
  4382. ) {
  4383. for (i = 0; i < to_len; i++) {
  4384. len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
  4385. if (i == 0) {
  4386. snode = onig_node_new_str(buf, buf + len);
  4387. CHECK_NULL_RETURN_MEMERR(snode);
  4388. /* char-class expanded multi-char only
  4389. compare with string folded at match time. */
  4390. NSTRING_SET_AMBIG(snode);
  4391. }
  4392. else {
  4393. r = onig_node_str_cat(snode, buf, buf + len);
  4394. if (r < 0) {
  4395. onig_node_free(snode);
  4396. return r;
  4397. }
  4398. }
  4399. }
  4400. *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
  4401. CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
  4402. iarg->ptail = &(NCDR((*(iarg->ptail))));
  4403. }
  4404. }
  4405. return 0;
  4406. }
  4407. static int
  4408. parse_exp(Node** np, OnigToken* tok, int term,
  4409. UChar** src, UChar* end, ScanEnv* env)
  4410. {
  4411. int r, len, group = 0;
  4412. Node* qn;
  4413. Node** targetp;
  4414. *np = NULL;
  4415. if (tok->type == (enum TokenSyms )term)
  4416. goto end_of_token;
  4417. switch (tok->type) {
  4418. case TK_ALT:
  4419. case TK_EOT:
  4420. end_of_token:
  4421. *np = node_new_empty();
  4422. return tok->type;
  4423. break;
  4424. case TK_SUBEXP_OPEN:
  4425. r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
  4426. if (r < 0) return r;
  4427. if (r == 1) group = 1;
  4428. else if (r == 2) { /* option only */
  4429. Node* target;
  4430. OnigOptionType prev = env->option;
  4431. env->option = NENCLOSE(*np)->option;
  4432. r = fetch_token(tok, src, end, env);
  4433. if (r < 0) return r;
  4434. r = parse_subexp(&target, tok, term, src, end, env);
  4435. env->option = prev;
  4436. if (r < 0) return r;
  4437. NENCLOSE(*np)->target = target;
  4438. return tok->type;
  4439. }
  4440. break;
  4441. case TK_SUBEXP_CLOSE:
  4442. if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
  4443. return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
  4444. if (tok->escaped) goto tk_raw_byte;
  4445. else goto tk_byte;
  4446. break;
  4447. case TK_STRING:
  4448. tk_byte:
  4449. {
  4450. *np = node_new_str(tok->backp, *src);
  4451. CHECK_NULL_RETURN_MEMERR(*np);
  4452. while (1) {
  4453. r = fetch_token(tok, src, end, env);
  4454. if (r < 0) return r;
  4455. if (r != TK_STRING) break;
  4456. r = onig_node_str_cat(*np, tok->backp, *src);
  4457. if (r < 0) return r;
  4458. }
  4459. string_end:
  4460. targetp = np;
  4461. goto repeat;
  4462. }
  4463. break;
  4464. case TK_RAW_BYTE:
  4465. tk_raw_byte:
  4466. {
  4467. *np = node_new_str_raw_char((UChar )tok->u.c);
  4468. CHECK_NULL_RETURN_MEMERR(*np);
  4469. len = 1;
  4470. while (1) {
  4471. if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
  4472. if (len == enclen(env->enc, NSTR(*np)->s)) {
  4473. r = fetch_token(tok, src, end, env);
  4474. NSTRING_CLEAR_RAW(*np);
  4475. goto string_end;
  4476. }
  4477. }
  4478. r = fetch_token(tok, src, end, env);
  4479. if (r < 0) return r;
  4480. if (r != TK_RAW_BYTE) {
  4481. /* Don't use this, it is wrong for little endian encodings. */
  4482. #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
  4483. int rem;
  4484. if (len < ONIGENC_MBC_MINLEN(env->enc)) {
  4485. rem = ONIGENC_MBC_MINLEN(env->enc) - len;
  4486. (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
  4487. if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
  4488. NSTRING_CLEAR_RAW(*np);
  4489. goto string_end;
  4490. }
  4491. }
  4492. #endif
  4493. return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
  4494. }
  4495. r = node_str_cat_char(*np, (UChar )tok->u.c);
  4496. if (r < 0) return r;
  4497. len++;
  4498. }
  4499. }
  4500. break;
  4501. case TK_CODE_POINT:
  4502. {
  4503. UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
  4504. int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
  4505. if (num < 0) return num;
  4506. #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
  4507. *np = node_new_str_raw(buf, buf + num);
  4508. #else
  4509. *np = node_new_str(buf, buf + num);
  4510. #endif
  4511. CHECK_NULL_RETURN_MEMERR(*np);
  4512. }
  4513. break;
  4514. case TK_QUOTE_OPEN:
  4515. {
  4516. OnigCodePoint end_op[2];
  4517. UChar *qstart, *qend, *nextp;
  4518. end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
  4519. end_op[1] = (OnigCodePoint )'E';
  4520. qstart = *src;
  4521. qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
  4522. if (IS_NULL(qend)) {
  4523. nextp = qend = end;
  4524. }
  4525. *np = node_new_str(qstart, qend);
  4526. CHECK_NULL_RETURN_MEMERR(*np);
  4527. *src = nextp;
  4528. }
  4529. break;
  4530. case TK_CHAR_TYPE:
  4531. {
  4532. switch (tok->u.prop.ctype) {
  4533. case ONIGENC_CTYPE_WORD:
  4534. *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
  4535. CHECK_NULL_RETURN_MEMERR(*np);
  4536. break;
  4537. case ONIGENC_CTYPE_SPACE:
  4538. case ONIGENC_CTYPE_DIGIT:
  4539. case ONIGENC_CTYPE_XDIGIT:
  4540. {
  4541. CClassNode* cc;
  4542. #ifdef USE_SHARED_CCLASS_TABLE
  4543. const OnigCodePoint *mbr;
  4544. OnigCodePoint sb_out;
  4545. r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
  4546. &sb_out, &mbr);
  4547. if (r == 0 &&
  4548. ONIGENC_CODE_RANGE_NUM(mbr)
  4549. >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
  4550. type_cclass_key key;
  4551. type_cclass_key* new_key;
  4552. key.enc = env->enc;
  4553. key.not = tok->u.prop.not;
  4554. key.type = tok->u.prop.ctype;
  4555. THREAD_ATOMIC_START;
  4556. if (IS_NULL(OnigTypeCClassTable)) {
  4557. OnigTypeCClassTable
  4558. = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
  4559. if (IS_NULL(OnigTypeCClassTable)) {
  4560. THREAD_ATOMIC_END;
  4561. return ONIGERR_MEMORY;
  4562. }
  4563. }
  4564. else {
  4565. if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
  4566. (st_data_t* )np)) {
  4567. THREAD_ATOMIC_END;
  4568. break;
  4569. }
  4570. }
  4571. *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
  4572. sb_out, mbr);
  4573. if (IS_NULL(*np)) {
  4574. THREAD_ATOMIC_END;
  4575. return ONIGERR_MEMORY;
  4576. }
  4577. cc = NCCLASS(*np);
  4578. NCCLASS_SET_SHARE(cc);
  4579. new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
  4580. xmemcpy(new_key, &key, sizeof(type_cclass_key));
  4581. onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
  4582. (st_data_t )*np);
  4583. THREAD_ATOMIC_END;
  4584. }
  4585. else {
  4586. #endif
  4587. *np = node_new_cclass();
  4588. CHECK_NULL_RETURN_MEMERR(*np);
  4589. cc = NCCLASS(*np);
  4590. add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
  4591. if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
  4592. #ifdef USE_SHARED_CCLASS_TABLE
  4593. }
  4594. #endif
  4595. }
  4596. break;
  4597. default:
  4598. return ONIGERR_PARSER_BUG;
  4599. break;
  4600. }
  4601. }
  4602. break;
  4603. case TK_CHAR_PROPERTY:
  4604. r = parse_char_property(np, tok, src, end, env);
  4605. if (r != 0) return r;
  4606. break;
  4607. case TK_CC_OPEN:
  4608. {
  4609. CClassNode* cc;
  4610. r = parse_char_class(np, tok, src, end, env);
  4611. if (r != 0) return r;
  4612. cc = NCCLASS(*np);
  4613. if (IS_IGNORECASE(env->option)) {
  4614. IApplyCaseFoldArg iarg;
  4615. iarg.env = env;
  4616. iarg.cc = cc;
  4617. iarg.alt_root = NULL_NODE;
  4618. iarg.ptail = &(iarg.alt_root);
  4619. r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
  4620. i_apply_case_fold, &iarg);
  4621. if (r != 0) {
  4622. onig_node_free(iarg.alt_root);
  4623. return r;
  4624. }
  4625. if (IS_NOT_NULL(iarg.alt_root)) {
  4626. Node* work = onig_node_new_alt(*np, iarg.alt_root);
  4627. if (IS_NULL(work)) {
  4628. onig_node_free(iarg.alt_root);
  4629. return ONIGERR_MEMORY;
  4630. }
  4631. *np = work;
  4632. }
  4633. }
  4634. }
  4635. break;
  4636. case TK_ANYCHAR:
  4637. *np = node_new_anychar();
  4638. CHECK_NULL_RETURN_MEMERR(*np);
  4639. break;
  4640. case TK_ANYCHAR_ANYTIME:
  4641. *np = node_new_anychar();
  4642. CHECK_NULL_RETURN_MEMERR(*np);
  4643. qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
  4644. CHECK_NULL_RETURN_MEMERR(qn);
  4645. NQTFR(qn)->target = *np;
  4646. *np = qn;
  4647. break;
  4648. case TK_BACKREF:
  4649. len = tok->u.backref.num;
  4650. *np = node_new_backref(len,
  4651. (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
  4652. tok->u.backref.by_name,
  4653. #ifdef USE_BACKREF_WITH_LEVEL
  4654. tok->u.backref.exist_level,
  4655. tok->u.backref.level,
  4656. #endif
  4657. env);
  4658. CHECK_NULL_RETURN_MEMERR(*np);
  4659. break;
  4660. #ifdef USE_SUBEXP_CALL
  4661. case TK_CALL:
  4662. {
  4663. int gnum = tok->u.call.gnum;
  4664. if (gnum < 0) {
  4665. gnum = BACKREF_REL_TO_ABS(gnum, env);
  4666. if (gnum <= 0)
  4667. return ONIGERR_INVALID_BACKREF;
  4668. }
  4669. *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
  4670. CHECK_NULL_RETURN_MEMERR(*np);
  4671. env->num_call++;
  4672. }
  4673. break;
  4674. #endif
  4675. case TK_ANCHOR:
  4676. *np = onig_node_new_anchor(tok->u.anchor);
  4677. break;
  4678. case TK_OP_REPEAT:
  4679. case TK_INTERVAL:
  4680. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
  4681. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
  4682. return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
  4683. else
  4684. *np = node_new_empty();
  4685. }
  4686. else {
  4687. goto tk_byte;
  4688. }
  4689. break;
  4690. default:
  4691. return ONIGERR_PARSER_BUG;
  4692. break;
  4693. }
  4694. {
  4695. targetp = np;
  4696. re_entry:
  4697. r = fetch_token(tok, src, end, env);
  4698. if (r < 0) return r;
  4699. repeat:
  4700. if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
  4701. if (is_invalid_quantifier_target(*targetp))
  4702. return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
  4703. qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
  4704. (r == TK_INTERVAL ? 1 : 0));
  4705. CHECK_NULL_RETURN_MEMERR(qn);
  4706. NQTFR(qn)->greedy = tok->u.repeat.greedy;
  4707. r = set_quantifier(qn, *targetp, group, env);
  4708. if (r < 0) {
  4709. onig_node_free(qn);
  4710. return r;
  4711. }
  4712. if (tok->u.repeat.possessive != 0) {
  4713. Node* en;
  4714. en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
  4715. if (IS_NULL(en)) {
  4716. onig_node_free(qn);
  4717. return ONIGERR_MEMORY;
  4718. }
  4719. NENCLOSE(en)->target = qn;
  4720. qn = en;
  4721. }
  4722. if (r == 0) {
  4723. *targetp = qn;
  4724. }
  4725. else if (r == 1) {
  4726. onig_node_free(qn);
  4727. }
  4728. else if (r == 2) { /* split case: /abc+/ */
  4729. Node *tmp;
  4730. *targetp = node_new_list(*targetp, NULL);
  4731. if (IS_NULL(*targetp)) {
  4732. onig_node_free(qn);
  4733. return ONIGERR_MEMORY;
  4734. }
  4735. tmp = NCDR(*targetp) = node_new_list(qn, NULL);
  4736. if (IS_NULL(tmp)) {
  4737. onig_node_free(qn);
  4738. return ONIGERR_MEMORY;
  4739. }
  4740. targetp = &(NCAR(tmp));
  4741. }
  4742. goto re_entry;
  4743. }
  4744. }
  4745. return r;
  4746. }
  4747. static int
  4748. parse_branch(Node** top, OnigToken* tok, int term,
  4749. UChar** src, UChar* end, ScanEnv* env)
  4750. {
  4751. int r;
  4752. Node *node, **headp;
  4753. *top = NULL;
  4754. r = parse_exp(&node, tok, term, src, end, env);
  4755. if (r < 0) return r;
  4756. if (r == TK_EOT || r == term || r == TK_ALT) {
  4757. *top = node;
  4758. }
  4759. else {
  4760. *top = node_new_list(node, NULL);
  4761. headp = &(NCDR(*top));
  4762. while (r != TK_EOT && r != term && r != TK_ALT) {
  4763. r = parse_exp(&node, tok, term, src, end, env);
  4764. if (r < 0) return r;
  4765. if (NTYPE(node) == NT_LIST) {
  4766. *headp = node;
  4767. while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
  4768. headp = &(NCDR(node));
  4769. }
  4770. else {
  4771. *headp = node_new_list(node, NULL);
  4772. headp = &(NCDR(*headp));
  4773. }
  4774. }
  4775. }
  4776. return r;
  4777. }
  4778. /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
  4779. static int
  4780. parse_subexp(Node** top, OnigToken* tok, int term,
  4781. UChar** src, UChar* end, ScanEnv* env)
  4782. {
  4783. int r;
  4784. Node *node, **headp;
  4785. *top = NULL;
  4786. r = parse_branch(&node, tok, term, src, end, env);
  4787. if (r < 0) {
  4788. onig_node_free(node);
  4789. return r;
  4790. }
  4791. if (r == term) {
  4792. *top = node;
  4793. }
  4794. else if (r == TK_ALT) {
  4795. *top = onig_node_new_alt(node, NULL);
  4796. headp = &(NCDR(*top));
  4797. while (r == TK_ALT) {
  4798. r = fetch_token(tok, src, end, env);
  4799. if (r < 0) return r;
  4800. r = parse_branch(&node, tok, term, src, end, env);
  4801. if (r < 0) return r;
  4802. *headp = onig_node_new_alt(node, NULL);
  4803. headp = &(NCDR(*headp));
  4804. }
  4805. if (tok->type != (enum TokenSyms )term)
  4806. goto err;
  4807. }
  4808. else {
  4809. err:
  4810. if (term == TK_SUBEXP_CLOSE)
  4811. return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
  4812. else
  4813. return ONIGERR_PARSER_BUG;
  4814. }
  4815. return r;
  4816. }
  4817. static int
  4818. parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
  4819. {
  4820. int r;
  4821. OnigToken tok;
  4822. r = fetch_token(&tok, src, end, env);
  4823. if (r < 0) return r;
  4824. r = parse_subexp(top, &tok, TK_EOT, src, end, env);
  4825. if (r < 0) return r;
  4826. return 0;
  4827. }
  4828. extern int
  4829. onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
  4830. regex_t* reg, ScanEnv* env)
  4831. {
  4832. int r;
  4833. UChar* p;
  4834. #ifdef USE_NAMED_GROUP
  4835. names_clear(reg);
  4836. #endif
  4837. scan_env_clear(env);
  4838. env->option = reg->options;
  4839. env->case_fold_flag = reg->case_fold_flag;
  4840. env->enc = reg->enc;
  4841. env->syntax = reg->syntax;
  4842. env->pattern = (UChar* )pattern;
  4843. env->pattern_end = (UChar* )end;
  4844. env->reg = reg;
  4845. *root = NULL;
  4846. p = (UChar* )pattern;
  4847. r = parse_regexp(root, &p, (UChar* )end, env);
  4848. reg->num_mem = env->num_mem;
  4849. return r;
  4850. }
  4851. extern void
  4852. onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
  4853. UChar* arg, UChar* arg_end)
  4854. {
  4855. env->error = arg;
  4856. env->error_end = arg_end;
  4857. }