engine.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019
  1. /*
  2. * The matching engine and friends. This file is #included by regexec.c
  3. * after suitable #defines of a variety of macros used herein, so that
  4. * different state representations can be used without duplicating masses
  5. * of code.
  6. */
  7. #ifdef SNAMES
  8. #define matcher smatcher
  9. #define fast sfast
  10. #define slow sslow
  11. #define dissect sdissect
  12. #define backref sbackref
  13. #define step sstep
  14. #define print sprint
  15. #define at sat
  16. #define match smat
  17. #endif
  18. #ifdef LNAMES
  19. #define matcher lmatcher
  20. #define fast lfast
  21. #define slow lslow
  22. #define dissect ldissect
  23. #define backref lbackref
  24. #define step lstep
  25. #define print lprint
  26. #define at lat
  27. #define match lmat
  28. #endif
  29. /* another structure passed up and down to avoid zillions of parameters */
  30. struct match {
  31. struct re_guts *g;
  32. int eflags;
  33. regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
  34. unsigned char *offp; /* offsets work from here */
  35. unsigned char *beginp; /* start of string -- virtual NUL precedes */
  36. unsigned char *endp; /* end of string -- virtual NUL here */
  37. unsigned char *coldp; /* can be no match starting before here */
  38. unsigned char **lastpos; /* [nplus+1] */
  39. STATEVARS;
  40. states st; /* current states */
  41. states fresh; /* states for a fresh start */
  42. states tmp; /* temporary */
  43. states empty; /* empty set of states */
  44. };
  45. #include "engine.ih"
  46. #ifdef REDEBUG
  47. #define SP(t, s, c) print(m, t, s, c, stdout)
  48. #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
  49. #define NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
  50. #else
  51. #define SP(t, s, c) /* nothing */
  52. #define AT(t, p1, p2, s1, s2) /* nothing */
  53. #define NOTE(s) /* nothing */
  54. #endif
  55. /*
  56. - matcher - the actual matching engine
  57. == static int matcher(register struct re_guts *g, char *string, \
  58. == size_t nmatch, regmatch_t pmatch[], int eflags);
  59. */
  60. static int /* 0 success, REG_NOMATCH failure */
  61. matcher(g, string, nmatch, pmatch, eflags)
  62. register struct re_guts *g;
  63. unsigned char *string;
  64. size_t nmatch;
  65. regmatch_t pmatch[];
  66. int eflags;
  67. {
  68. register unsigned char *endp;
  69. register size_t i;
  70. struct match mv;
  71. register struct match *m = &mv;
  72. register unsigned char *dp;
  73. const register sopno gf = g->firststate+1; /* +1 for OEND */
  74. const register sopno gl = g->laststate;
  75. unsigned char *start;
  76. unsigned char *stop;
  77. /* simplify the situation where possible */
  78. if (g->cflags&REG_NOSUB)
  79. nmatch = 0;
  80. if (eflags&REG_STARTEND) {
  81. start = string + pmatch[0].rm_so;
  82. stop = string + pmatch[0].rm_eo;
  83. } else {
  84. start = string;
  85. stop = start + strlen(start);
  86. }
  87. if (stop < start)
  88. return(REG_INVARG);
  89. /* prescreening; this does wonders for this rather slow code */
  90. if (g->must != NULL) {
  91. for (dp = start; dp < stop; dp++)
  92. if (*dp == g->must[0] && stop - dp >= g->mlen &&
  93. memcmp(dp, g->must, (size_t)g->mlen) == 0)
  94. break;
  95. if (dp == stop) /* we didn't find g->must */
  96. return(REG_NOMATCH);
  97. }
  98. /* match struct setup */
  99. m->g = g;
  100. m->eflags = eflags;
  101. m->pmatch = NULL;
  102. m->lastpos = NULL;
  103. m->offp = string;
  104. m->beginp = start;
  105. m->endp = stop;
  106. STATESETUP(m, 4);
  107. SETUP(m->st);
  108. SETUP(m->fresh);
  109. SETUP(m->tmp);
  110. SETUP(m->empty);
  111. CLEAR(m->empty);
  112. /* this loop does only one repetition except for backrefs */
  113. for (;;) {
  114. endp = fast(m, start, stop, gf, gl);
  115. if (endp == NULL) { /* a miss */
  116. STATETEARDOWN(m);
  117. return(REG_NOMATCH);
  118. }
  119. if (nmatch == 0 && !g->backrefs)
  120. break; /* no further info needed */
  121. /* where? */
  122. assert(m->coldp != NULL);
  123. for (;;) {
  124. NOTE("finding start");
  125. endp = slow(m, m->coldp, stop, gf, gl);
  126. if (endp != NULL)
  127. break;
  128. assert(m->coldp < m->endp);
  129. m->coldp++;
  130. }
  131. if (nmatch == 1 && !g->backrefs)
  132. break; /* no further info needed */
  133. /* oh my, he wants the subexpressions... */
  134. if (m->pmatch == NULL)
  135. m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
  136. sizeof(regmatch_t));
  137. if (m->pmatch == NULL) {
  138. STATETEARDOWN(m);
  139. return(REG_ESPACE);
  140. }
  141. for (i = 1; i <= m->g->nsub; i++)
  142. m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
  143. if (!g->backrefs && !(m->eflags&REG_BACKR)) {
  144. NOTE("dissecting");
  145. dp = dissect(m, m->coldp, endp, gf, gl);
  146. } else {
  147. if (g->nplus > 0 && m->lastpos == NULL)
  148. m->lastpos = (unsigned char **)malloc((g->nplus+1) *
  149. sizeof(unsigned char *));
  150. if (g->nplus > 0 && m->lastpos == NULL) {
  151. free((char *)m->pmatch);
  152. STATETEARDOWN(m);
  153. return(REG_ESPACE);
  154. }
  155. NOTE("backref dissect");
  156. dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
  157. }
  158. if (dp != NULL)
  159. break;
  160. /* uh-oh... we couldn't find a subexpression-level match */
  161. assert(g->backrefs); /* must be back references doing it */
  162. assert(g->nplus == 0 || m->lastpos != NULL);
  163. for (;;) {
  164. if (dp != NULL || endp <= m->coldp)
  165. break; /* defeat */
  166. NOTE("backoff");
  167. endp = slow(m, m->coldp, endp-1, gf, gl);
  168. if (endp == NULL)
  169. break; /* defeat */
  170. /* try it on a shorter possibility */
  171. #ifndef NDEBUG
  172. for (i = 1; i <= m->g->nsub; i++) {
  173. assert(m->pmatch[i].rm_so == -1);
  174. assert(m->pmatch[i].rm_eo == -1);
  175. }
  176. #endif
  177. NOTE("backoff dissect");
  178. dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
  179. }
  180. assert(dp == NULL || dp == endp);
  181. if (dp != NULL) /* found a shorter one */
  182. break;
  183. /* despite initial appearances, there is no match here */
  184. NOTE("false alarm");
  185. start = m->coldp + 1; /* recycle starting later */
  186. assert(start <= stop);
  187. }
  188. /* fill in the details if requested */
  189. if (nmatch > 0) {
  190. pmatch[0].rm_so = m->coldp - m->offp;
  191. pmatch[0].rm_eo = endp - m->offp;
  192. }
  193. if (nmatch > 1) {
  194. assert(m->pmatch != NULL);
  195. for (i = 1; i < nmatch; i++)
  196. if (i <= m->g->nsub)
  197. pmatch[i] = m->pmatch[i];
  198. else {
  199. pmatch[i].rm_so = -1;
  200. pmatch[i].rm_eo = -1;
  201. }
  202. }
  203. if (m->pmatch != NULL)
  204. free((char *)m->pmatch);
  205. if (m->lastpos != NULL)
  206. free((char *)m->lastpos);
  207. STATETEARDOWN(m);
  208. return(0);
  209. }
  210. /*
  211. - dissect - figure out what matched what, no back references
  212. == static unsigned char *dissect(register struct match *m, unsigned char *start, \
  213. == unsigned char *stop, sopno startst, sopno stopst);
  214. */
  215. static unsigned char * /* == stop (success) always */
  216. dissect(m, start, stop, startst, stopst)
  217. register struct match *m;
  218. unsigned char *start;
  219. unsigned char *stop;
  220. sopno startst;
  221. sopno stopst;
  222. {
  223. register int i;
  224. register sopno ss; /* start sop of current subRE */
  225. register sopno es; /* end sop of current subRE */
  226. register unsigned char *sp; /* start of string matched by it */
  227. register unsigned char *stp; /* string matched by it cannot pass here */
  228. register unsigned char *rest; /* start of rest of string */
  229. register unsigned char *tail; /* string unmatched by rest of RE */
  230. register sopno ssub; /* start sop of subsubRE */
  231. register sopno esub; /* end sop of subsubRE */
  232. register unsigned char *ssp; /* start of string matched by subsubRE */
  233. register unsigned char *sep; /* end of string matched by subsubRE */
  234. register unsigned char *oldssp; /* previous ssp */
  235. register unsigned char *dp;
  236. AT("diss", start, stop, startst, stopst);
  237. sp = start;
  238. for (ss = startst; ss < stopst; ss = es) {
  239. /* identify end of subRE */
  240. es = ss;
  241. switch (OP(m->g->strip[es])) {
  242. case OPLUS_:
  243. case OQUEST_:
  244. es += OPND(m->g->strip[es]);
  245. break;
  246. case OCH_:
  247. while (OP(m->g->strip[es]) != O_CH)
  248. es += OPND(m->g->strip[es]);
  249. break;
  250. }
  251. es++;
  252. /* figure out what it matched */
  253. switch (OP(m->g->strip[ss])) {
  254. case OEND:
  255. assert(PHP_REGEX_NOPE);
  256. break;
  257. case OCHAR:
  258. sp++;
  259. break;
  260. case OBOL:
  261. case OEOL:
  262. case OBOW:
  263. case OEOW:
  264. break;
  265. case OANY:
  266. case OANYOF:
  267. sp++;
  268. break;
  269. case OBACK_:
  270. case O_BACK:
  271. assert(PHP_REGEX_NOPE);
  272. break;
  273. /* cases where length of match is hard to find */
  274. case OQUEST_:
  275. stp = stop;
  276. for (;;) {
  277. /* how long could this one be? */
  278. rest = slow(m, sp, stp, ss, es);
  279. assert(rest != NULL); /* it did match */
  280. /* could the rest match the rest? */
  281. tail = slow(m, rest, stop, es, stopst);
  282. if (tail == stop)
  283. break; /* yes! */
  284. /* no -- try a shorter match for this one */
  285. stp = rest - 1;
  286. assert(stp >= sp); /* it did work */
  287. }
  288. ssub = ss + 1;
  289. esub = es - 1;
  290. /* did innards match? */
  291. if (slow(m, sp, rest, ssub, esub) != NULL) {
  292. dp = dissect(m, sp, rest, ssub, esub);
  293. assert(dp == rest);
  294. } else /* no */
  295. assert(sp == rest);
  296. sp = rest;
  297. break;
  298. case OPLUS_:
  299. stp = stop;
  300. for (;;) {
  301. /* how long could this one be? */
  302. rest = slow(m, sp, stp, ss, es);
  303. assert(rest != NULL); /* it did match */
  304. /* could the rest match the rest? */
  305. tail = slow(m, rest, stop, es, stopst);
  306. if (tail == stop)
  307. break; /* yes! */
  308. /* no -- try a shorter match for this one */
  309. stp = rest - 1;
  310. assert(stp >= sp); /* it did work */
  311. }
  312. ssub = ss + 1;
  313. esub = es - 1;
  314. ssp = sp;
  315. oldssp = ssp;
  316. for (;;) { /* find last match of innards */
  317. sep = slow(m, ssp, rest, ssub, esub);
  318. if (sep == NULL || sep == ssp)
  319. break; /* failed or matched null */
  320. oldssp = ssp; /* on to next try */
  321. ssp = sep;
  322. }
  323. if (sep == NULL) {
  324. /* last successful match */
  325. sep = ssp;
  326. ssp = oldssp;
  327. }
  328. assert(sep == rest); /* must exhaust substring */
  329. assert(slow(m, ssp, sep, ssub, esub) == rest);
  330. dp = dissect(m, ssp, sep, ssub, esub);
  331. assert(dp == sep);
  332. sp = rest;
  333. break;
  334. case OCH_:
  335. stp = stop;
  336. for (;;) {
  337. /* how long could this one be? */
  338. rest = slow(m, sp, stp, ss, es);
  339. assert(rest != NULL); /* it did match */
  340. /* could the rest match the rest? */
  341. tail = slow(m, rest, stop, es, stopst);
  342. if (tail == stop)
  343. break; /* yes! */
  344. /* no -- try a shorter match for this one */
  345. stp = rest - 1;
  346. assert(stp >= sp); /* it did work */
  347. }
  348. ssub = ss + 1;
  349. esub = ss + OPND(m->g->strip[ss]) - 1;
  350. assert(OP(m->g->strip[esub]) == OOR1);
  351. for (;;) { /* find first matching branch */
  352. if (slow(m, sp, rest, ssub, esub) == rest)
  353. break; /* it matched all of it */
  354. /* that one missed, try next one */
  355. assert(OP(m->g->strip[esub]) == OOR1);
  356. esub++;
  357. assert(OP(m->g->strip[esub]) == OOR2);
  358. ssub = esub + 1;
  359. esub += OPND(m->g->strip[esub]);
  360. if (OP(m->g->strip[esub]) == OOR2)
  361. esub--;
  362. else
  363. assert(OP(m->g->strip[esub]) == O_CH);
  364. }
  365. dp = dissect(m, sp, rest, ssub, esub);
  366. assert(dp == rest);
  367. sp = rest;
  368. break;
  369. case O_PLUS:
  370. case O_QUEST:
  371. case OOR1:
  372. case OOR2:
  373. case O_CH:
  374. assert(PHP_REGEX_NOPE);
  375. break;
  376. case OLPAREN:
  377. i = OPND(m->g->strip[ss]);
  378. assert(0 < i && i <= m->g->nsub);
  379. m->pmatch[i].rm_so = sp - m->offp;
  380. break;
  381. case ORPAREN:
  382. i = OPND(m->g->strip[ss]);
  383. assert(0 < i && i <= m->g->nsub);
  384. m->pmatch[i].rm_eo = sp - m->offp;
  385. break;
  386. default: /* uh oh */
  387. assert(PHP_REGEX_NOPE);
  388. break;
  389. }
  390. }
  391. assert(sp == stop);
  392. return(sp);
  393. }
  394. /*
  395. - backref - figure out what matched what, figuring in back references
  396. == static unsigned char *backref(register struct match *m, unsigned char *start, \
  397. == unsigned char *stop, sopno startst, sopno stopst, sopno lev);
  398. */
  399. static unsigned char * /* == stop (success) or NULL (failure) */
  400. backref(m, start, stop, startst, stopst, lev)
  401. register struct match *m;
  402. unsigned char *start;
  403. unsigned char *stop;
  404. sopno startst;
  405. sopno stopst;
  406. sopno lev; /* PLUS nesting level */
  407. {
  408. register int i;
  409. register sopno ss; /* start sop of current subRE */
  410. register unsigned char *sp; /* start of string matched by it */
  411. register sopno ssub; /* start sop of subsubRE */
  412. register sopno esub; /* end sop of subsubRE */
  413. register unsigned char *ssp; /* start of string matched by subsubRE */
  414. register unsigned char *dp;
  415. register size_t len;
  416. register int hard;
  417. register sop s;
  418. register regoff_t offsave;
  419. register cset *cs;
  420. AT("back", start, stop, startst, stopst);
  421. sp = start;
  422. /* get as far as we can with easy stuff */
  423. hard = 0;
  424. for (ss = startst; !hard && ss < stopst; ss++)
  425. switch (OP(s = m->g->strip[ss])) {
  426. case OCHAR:
  427. if (sp == stop || *sp++ != (unsigned char)OPND(s))
  428. return(NULL);
  429. break;
  430. case OANY:
  431. if (sp == stop)
  432. return(NULL);
  433. sp++;
  434. break;
  435. case OANYOF:
  436. cs = &m->g->sets[OPND(s)];
  437. if (sp == stop || !CHIN(cs, *sp++))
  438. return(NULL);
  439. break;
  440. case OBOL:
  441. if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
  442. (sp < m->endp && *(sp-1) == '\n' &&
  443. (m->g->cflags&REG_NEWLINE)) )
  444. { /* yes */ }
  445. else
  446. return(NULL);
  447. break;
  448. case OEOL:
  449. if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
  450. (sp < m->endp && *sp == '\n' &&
  451. (m->g->cflags&REG_NEWLINE)) )
  452. { /* yes */ }
  453. else
  454. return(NULL);
  455. break;
  456. case OBOW:
  457. if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
  458. (sp < m->endp && *(sp-1) == '\n' &&
  459. (m->g->cflags&REG_NEWLINE)) ||
  460. (sp > m->beginp &&
  461. !ISWORD(*(sp-1))) ) &&
  462. (sp < m->endp && ISWORD(*sp)) )
  463. { /* yes */ }
  464. else
  465. return(NULL);
  466. break;
  467. case OEOW:
  468. if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
  469. (sp < m->endp && *sp == '\n' &&
  470. (m->g->cflags&REG_NEWLINE)) ||
  471. (sp < m->endp && !ISWORD(*sp)) ) &&
  472. (sp > m->beginp && ISWORD(*(sp-1))) )
  473. { /* yes */ }
  474. else
  475. return(NULL);
  476. break;
  477. case O_QUEST:
  478. break;
  479. case OOR1: /* matches null but needs to skip */
  480. ss++;
  481. s = m->g->strip[ss];
  482. do {
  483. assert(OP(s) == OOR2);
  484. ss += OPND(s);
  485. } while (OP(s = m->g->strip[ss]) != O_CH);
  486. /* note that the ss++ gets us past the O_CH */
  487. break;
  488. default: /* have to make a choice */
  489. hard = 1;
  490. break;
  491. }
  492. if (!hard) { /* that was it! */
  493. if (sp != stop)
  494. return(NULL);
  495. return(sp);
  496. }
  497. ss--; /* adjust for the for's final increment */
  498. /* the hard stuff */
  499. AT("hard", sp, stop, ss, stopst);
  500. s = m->g->strip[ss];
  501. switch (OP(s)) {
  502. case OBACK_: /* the vilest depths */
  503. i = OPND(s);
  504. assert(0 < i && i <= m->g->nsub);
  505. if (m->pmatch[i].rm_eo == -1)
  506. return(NULL);
  507. assert(m->pmatch[i].rm_so != -1);
  508. len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
  509. assert(stop - m->beginp >= len);
  510. if (sp > stop - len)
  511. return(NULL); /* not enough left to match */
  512. ssp = m->offp + m->pmatch[i].rm_so;
  513. if (memcmp(sp, ssp, len) != 0)
  514. return(NULL);
  515. while (m->g->strip[ss] != SOP(O_BACK, i))
  516. ss++;
  517. return(backref(m, sp+len, stop, ss+1, stopst, lev));
  518. break;
  519. case OQUEST_: /* to null or not */
  520. dp = backref(m, sp, stop, ss+1, stopst, lev);
  521. if (dp != NULL)
  522. return(dp); /* not */
  523. return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
  524. break;
  525. case OPLUS_:
  526. assert(m->lastpos != NULL);
  527. assert(lev+1 <= m->g->nplus);
  528. m->lastpos[lev+1] = sp;
  529. return(backref(m, sp, stop, ss+1, stopst, lev+1));
  530. break;
  531. case O_PLUS:
  532. if (sp == m->lastpos[lev]) /* last pass matched null */
  533. return(backref(m, sp, stop, ss+1, stopst, lev-1));
  534. /* try another pass */
  535. m->lastpos[lev] = sp;
  536. dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
  537. if (dp == NULL)
  538. return(backref(m, sp, stop, ss+1, stopst, lev-1));
  539. else
  540. return(dp);
  541. break;
  542. case OCH_: /* find the right one, if any */
  543. ssub = ss + 1;
  544. esub = ss + OPND(s) - 1;
  545. assert(OP(m->g->strip[esub]) == OOR1);
  546. for (;;) { /* find first matching branch */
  547. dp = backref(m, sp, stop, ssub, esub, lev);
  548. if (dp != NULL)
  549. return(dp);
  550. /* that one missed, try next one */
  551. if (OP(m->g->strip[esub]) == O_CH)
  552. return(NULL); /* there is none */
  553. esub++;
  554. assert(OP(m->g->strip[esub]) == OOR2);
  555. ssub = esub + 1;
  556. esub += OPND(m->g->strip[esub]);
  557. if (OP(m->g->strip[esub]) == OOR2)
  558. esub--;
  559. else
  560. assert(OP(m->g->strip[esub]) == O_CH);
  561. }
  562. break;
  563. case OLPAREN: /* must undo assignment if rest fails */
  564. i = OPND(s);
  565. assert(0 < i && i <= m->g->nsub);
  566. offsave = m->pmatch[i].rm_so;
  567. m->pmatch[i].rm_so = sp - m->offp;
  568. dp = backref(m, sp, stop, ss+1, stopst, lev);
  569. if (dp != NULL)
  570. return(dp);
  571. m->pmatch[i].rm_so = offsave;
  572. return(NULL);
  573. break;
  574. case ORPAREN: /* must undo assignment if rest fails */
  575. i = OPND(s);
  576. assert(0 < i && i <= m->g->nsub);
  577. offsave = m->pmatch[i].rm_eo;
  578. m->pmatch[i].rm_eo = sp - m->offp;
  579. dp = backref(m, sp, stop, ss+1, stopst, lev);
  580. if (dp != NULL)
  581. return(dp);
  582. m->pmatch[i].rm_eo = offsave;
  583. return(NULL);
  584. break;
  585. default: /* uh oh */
  586. assert(PHP_REGEX_NOPE);
  587. break;
  588. }
  589. /* "can't happen" */
  590. assert(PHP_REGEX_NOPE);
  591. /* NOTREACHED */
  592. return((unsigned char *)NULL); /* dummy */
  593. }
  594. /*
  595. - fast - step through the string at top speed
  596. == static unsigned char *fast(register struct match *m, unsigned char *start, \
  597. == unsigned char *stop, sopno startst, sopno stopst);
  598. */
  599. static unsigned char * /* where tentative match ended, or NULL */
  600. fast(m, start, stop, startst, stopst)
  601. register struct match *m;
  602. unsigned char *start;
  603. unsigned char *stop;
  604. sopno startst;
  605. sopno stopst;
  606. {
  607. register states st = m->st;
  608. register states fresh = m->fresh;
  609. register states tmp = m->tmp;
  610. register unsigned char *p = start;
  611. register int c = (start == m->beginp) ? OUT : *(start-1);
  612. register int lastc; /* previous c */
  613. register int flagch;
  614. register int i;
  615. register unsigned char *coldp; /* last p after which no match was underway */
  616. CLEAR(st);
  617. SET1(st, startst);
  618. st = step(m->g, startst, stopst, st, NOTHING, st);
  619. ASSIGN(fresh, st);
  620. SP("start", st, *p);
  621. coldp = NULL;
  622. for (;;) {
  623. /* next character */
  624. lastc = c;
  625. c = (p == m->endp) ? OUT : *p;
  626. if (EQ(st, fresh))
  627. coldp = p;
  628. /* is there an EOL and/or BOL between lastc and c? */
  629. flagch = '\0';
  630. i = 0;
  631. if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
  632. (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
  633. flagch = BOL;
  634. i = m->g->nbol;
  635. }
  636. if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
  637. (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
  638. flagch = (flagch == BOL) ? BOLEOL : EOL;
  639. i += m->g->neol;
  640. }
  641. if (i != 0) {
  642. for (; i > 0; i--)
  643. st = step(m->g, startst, stopst, st, flagch, st);
  644. SP("boleol", st, c);
  645. }
  646. /* how about a word boundary? */
  647. if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
  648. (c != OUT && ISWORD(c)) ) {
  649. flagch = BOW;
  650. }
  651. if ( (lastc != OUT && ISWORD(lastc)) &&
  652. (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
  653. flagch = EOW;
  654. }
  655. if (flagch == BOW || flagch == EOW) {
  656. st = step(m->g, startst, stopst, st, flagch, st);
  657. SP("boweow", st, c);
  658. }
  659. /* are we done? */
  660. if (ISSET(st, stopst) || p == stop)
  661. break; /* NOTE BREAK OUT */
  662. /* no, we must deal with this character */
  663. ASSIGN(tmp, st);
  664. ASSIGN(st, fresh);
  665. assert(c != OUT);
  666. st = step(m->g, startst, stopst, tmp, c, st);
  667. SP("aft", st, c);
  668. assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
  669. p++;
  670. }
  671. assert(coldp != NULL);
  672. m->coldp = coldp;
  673. if (ISSET(st, stopst))
  674. return(p+1);
  675. else
  676. return(NULL);
  677. }
  678. /*
  679. - slow - step through the string more deliberately
  680. == static unsigned char *slow(register struct match *m, unsigned char *start, \
  681. == unsigned char *stop, sopno startst, sopno stopst);
  682. */
  683. static unsigned char * /* where it ended */
  684. slow(m, start, stop, startst, stopst)
  685. register struct match *m;
  686. unsigned char *start;
  687. unsigned char *stop;
  688. sopno startst;
  689. sopno stopst;
  690. {
  691. register states st = m->st;
  692. register states empty = m->empty;
  693. register states tmp = m->tmp;
  694. register unsigned char *p = start;
  695. register int c = (start == m->beginp) ? OUT : *(start-1);
  696. register int lastc; /* previous c */
  697. register int flagch;
  698. register int i;
  699. register unsigned char *matchp; /* last p at which a match ended */
  700. AT("slow", start, stop, startst, stopst);
  701. CLEAR(st);
  702. SET1(st, startst);
  703. SP("sstart", st, *p);
  704. st = step(m->g, startst, stopst, st, NOTHING, st);
  705. matchp = NULL;
  706. for (;;) {
  707. /* next character */
  708. lastc = c;
  709. c = (p == m->endp) ? OUT : *p;
  710. /* is there an EOL and/or BOL between lastc and c? */
  711. flagch = '\0';
  712. i = 0;
  713. if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
  714. (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
  715. flagch = BOL;
  716. i = m->g->nbol;
  717. }
  718. if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
  719. (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
  720. flagch = (flagch == BOL) ? BOLEOL : EOL;
  721. i += m->g->neol;
  722. }
  723. if (i != 0) {
  724. for (; i > 0; i--)
  725. st = step(m->g, startst, stopst, st, flagch, st);
  726. SP("sboleol", st, c);
  727. }
  728. /* how about a word boundary? */
  729. if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
  730. (c != OUT && ISWORD(c)) ) {
  731. flagch = BOW;
  732. }
  733. if ( (lastc != OUT && ISWORD(lastc)) &&
  734. (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
  735. flagch = EOW;
  736. }
  737. if (flagch == BOW || flagch == EOW) {
  738. st = step(m->g, startst, stopst, st, flagch, st);
  739. SP("sboweow", st, c);
  740. }
  741. /* are we done? */
  742. if (ISSET(st, stopst))
  743. matchp = p;
  744. if (EQ(st, empty) || p == stop)
  745. break; /* NOTE BREAK OUT */
  746. /* no, we must deal with this character */
  747. ASSIGN(tmp, st);
  748. ASSIGN(st, empty);
  749. assert(c != OUT);
  750. st = step(m->g, startst, stopst, tmp, c, st);
  751. SP("saft", st, c);
  752. assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
  753. p++;
  754. }
  755. return(matchp);
  756. }
  757. /*
  758. - step - map set of states reachable before char to set reachable after
  759. == static states step(register struct re_guts *g, sopno start, sopno stop, \
  760. == register states bef, int ch, register states aft);
  761. == #define BOL (OUT+1)
  762. == #define EOL (BOL+1)
  763. == #define BOLEOL (BOL+2)
  764. == #define NOTHING (BOL+3)
  765. == #define BOW (BOL+4)
  766. == #define EOW (BOL+5)
  767. == #define CODEMAX (BOL+5) // highest code used
  768. == #define NONCHAR(c) ((c) > UCHAR_MAX)
  769. == #define NNONCHAR (CODEMAX-UCHAR_MAX)
  770. */
  771. static states
  772. step(g, start, stop, bef, ch, aft)
  773. register struct re_guts *g;
  774. sopno start; /* start state within strip */
  775. sopno stop; /* state after stop state within strip */
  776. register states bef; /* states reachable before */
  777. int ch; /* character or NONCHAR code */
  778. register states aft; /* states already known reachable after */
  779. {
  780. register cset *cs;
  781. register sop s;
  782. register sopno pc;
  783. register onestate here; /* note, macros know this name */
  784. register sopno look;
  785. register long i;
  786. for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
  787. s = g->strip[pc];
  788. switch (OP(s)) {
  789. case OEND:
  790. assert(pc == stop-1);
  791. break;
  792. case OCHAR:
  793. /* only characters can match */
  794. assert(!NONCHAR(ch) || ch != (unsigned char)OPND(s));
  795. if (ch == (unsigned char)OPND(s))
  796. FWD(aft, bef, 1);
  797. break;
  798. case OBOL:
  799. if (ch == BOL || ch == BOLEOL)
  800. FWD(aft, bef, 1);
  801. break;
  802. case OEOL:
  803. if (ch == EOL || ch == BOLEOL)
  804. FWD(aft, bef, 1);
  805. break;
  806. case OBOW:
  807. if (ch == BOW)
  808. FWD(aft, bef, 1);
  809. break;
  810. case OEOW:
  811. if (ch == EOW)
  812. FWD(aft, bef, 1);
  813. break;
  814. case OANY:
  815. if (!NONCHAR(ch))
  816. FWD(aft, bef, 1);
  817. break;
  818. case OANYOF:
  819. cs = &g->sets[OPND(s)];
  820. if (!NONCHAR(ch) && CHIN(cs, ch))
  821. FWD(aft, bef, 1);
  822. break;
  823. case OBACK_: /* ignored here */
  824. case O_BACK:
  825. FWD(aft, aft, 1);
  826. break;
  827. case OPLUS_: /* forward, this is just an empty */
  828. FWD(aft, aft, 1);
  829. break;
  830. case O_PLUS: /* both forward and back */
  831. FWD(aft, aft, 1);
  832. i = ISSETBACK(aft, OPND(s));
  833. BACK(aft, aft, OPND(s));
  834. if (!i && ISSETBACK(aft, OPND(s))) {
  835. /* oho, must reconsider loop body */
  836. pc -= OPND(s) + 1;
  837. INIT(here, pc);
  838. }
  839. break;
  840. case OQUEST_: /* two branches, both forward */
  841. FWD(aft, aft, 1);
  842. FWD(aft, aft, OPND(s));
  843. break;
  844. case O_QUEST: /* just an empty */
  845. FWD(aft, aft, 1);
  846. break;
  847. case OLPAREN: /* not significant here */
  848. case ORPAREN:
  849. FWD(aft, aft, 1);
  850. break;
  851. case OCH_: /* mark the first two branches */
  852. FWD(aft, aft, 1);
  853. assert(OP(g->strip[pc+OPND(s)]) == OOR2);
  854. FWD(aft, aft, OPND(s));
  855. break;
  856. case OOR1: /* done a branch, find the O_CH */
  857. if (ISSTATEIN(aft, here)) {
  858. for (look = 1;
  859. OP(s = g->strip[pc+look]) != O_CH;
  860. look += OPND(s))
  861. assert(OP(s) == OOR2);
  862. FWD(aft, aft, look);
  863. }
  864. break;
  865. case OOR2: /* propagate OCH_'s marking */
  866. FWD(aft, aft, 1);
  867. if (OP(g->strip[pc+OPND(s)]) != O_CH) {
  868. assert(OP(g->strip[pc+OPND(s)]) == OOR2);
  869. FWD(aft, aft, OPND(s));
  870. }
  871. break;
  872. case O_CH: /* just empty */
  873. FWD(aft, aft, 1);
  874. break;
  875. default: /* ooooops... */
  876. assert(PHP_REGEX_NOPE);
  877. break;
  878. }
  879. }
  880. return(aft);
  881. }
  882. #ifdef REDEBUG
  883. /*
  884. - print - print a set of states
  885. == #ifdef REDEBUG
  886. == static void print(struct match *m, unsigned char *caption, states st, \
  887. == int ch, FILE *d);
  888. == #endif
  889. */
  890. static void
  891. print(m, caption, st, ch, d)
  892. struct match *m;
  893. unsigned char *caption;
  894. states st;
  895. int ch;
  896. FILE *d;
  897. {
  898. register struct re_guts *g = m->g;
  899. register int i;
  900. register int first = 1;
  901. if (!(m->eflags&REG_TRACE))
  902. return;
  903. fprintf(d, "%s", caption);
  904. if (ch != '\0')
  905. fprintf(d, " %s", pchar(ch));
  906. for (i = 0; i < g->nstates; i++)
  907. if (ISSET(st, i)) {
  908. fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
  909. first = 0;
  910. }
  911. fprintf(d, "\n");
  912. }
  913. /*
  914. - at - print current situation
  915. == #ifdef REDEBUG
  916. == static void at(struct match *m, unsigned char *title, unsigned char *start, unsigned char *stop, \
  917. == sopno startst, sopno stopst);
  918. == #endif
  919. */
  920. static void
  921. at(m, title, start, stop, startst, stopst)
  922. struct match *m;
  923. unsigned char *title;
  924. unsigned char *start;
  925. unsigned char *stop;
  926. sopno startst;
  927. sopno stopst;
  928. {
  929. if (!(m->eflags&REG_TRACE))
  930. return;
  931. printf("%s %s-", title, pchar(*start));
  932. printf("%s ", pchar(*stop));
  933. printf("%ld-%ld\n", (long)startst, (long)stopst);
  934. }
  935. #ifndef PCHARDONE
  936. #define PCHARDONE /* never again */
  937. /*
  938. - pchar - make a character printable
  939. == #ifdef REDEBUG
  940. == static unsigned char *pchar(int ch);
  941. == #endif
  942. *
  943. * Is this identical to regchar() over in debug.c? Well, yes. But a
  944. * duplicate here avoids having a debugging-capable regexec.o tied to
  945. * a matching debug.o, and this is convenient. It all disappears in
  946. * the non-debug compilation anyway, so it doesn't matter much.
  947. */
  948. static unsigned char * /* -> representation */
  949. pchar(ch)
  950. int ch;
  951. {
  952. static unsigned char pbuf[10];
  953. if (isprint(ch) || ch == ' ')
  954. sprintf(pbuf, "%c", ch);
  955. else
  956. sprintf(pbuf, "\\%o", ch);
  957. return(pbuf);
  958. }
  959. #endif
  960. #endif
  961. #undef matcher
  962. #undef fast
  963. #undef slow
  964. #undef dissect
  965. #undef backref
  966. #undef step
  967. #undef print
  968. #undef at
  969. #undef match