regcomp.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625
  1. #include <sys/types.h>
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include <ctype.h>
  5. #include <limits.h>
  6. #include <stdlib.h>
  7. #define POSIX_MISTAKE
  8. #include "utils.h"
  9. #include "regex.h"
  10. #include "regex2.h"
  11. #include "cclass.h"
  12. #include "cname.h"
  13. /*
  14. * parse structure, passed up and down to avoid global variables and
  15. * other clumsinesses
  16. */
  17. struct parse {
  18. unsigned char *next; /* next character in RE */
  19. unsigned char *end; /* end of string (-> NUL normally) */
  20. int error; /* has an error been seen? */
  21. sop *strip; /* malloced strip */
  22. sopno ssize; /* malloced strip size (allocated) */
  23. sopno slen; /* malloced strip length (used) */
  24. int ncsalloc; /* number of csets allocated */
  25. struct re_guts *g;
  26. # define NPAREN 10 /* we need to remember () 1-9 for back refs */
  27. sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
  28. sopno pend[NPAREN]; /* -> ) ([0] unused) */
  29. };
  30. #include "regcomp.ih"
  31. static unsigned char nuls[10]; /* place to point scanner in event of error */
  32. /*
  33. * macros for use with parse structure
  34. * BEWARE: these know that the parse structure is named `p' !!!
  35. */
  36. #define PEEK() (*p->next)
  37. #define PEEK2() (*(p->next+1))
  38. #define MORE() (p->next < p->end)
  39. #define MORE2() (p->next+1 < p->end)
  40. #define SEE(c) (MORE() && PEEK() == (c))
  41. #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
  42. #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
  43. #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
  44. #define NEXT() (p->next++)
  45. #define NEXT2() (p->next += 2)
  46. #define NEXTn(n) (p->next += (n))
  47. #define GETNEXT() (*p->next++)
  48. #define SETERROR(e) seterr(p, (e))
  49. #define REQUIRE(co, e) (void) ((co) || SETERROR(e))
  50. #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
  51. #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
  52. #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
  53. #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
  54. #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
  55. #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
  56. #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
  57. #define HERE() (p->slen)
  58. #define THERE() (p->slen - 1)
  59. #define THERETHERE() (p->slen - 2)
  60. #define DROP(n) (p->slen -= (n))
  61. #ifndef NDEBUG
  62. static int never = 0; /* for use in asserts; shuts lint up */
  63. #else
  64. #define never 0 /* some <assert.h>s have bugs too */
  65. #endif
  66. /*
  67. - regcomp - interface for parser and compilation
  68. = API_EXPORT(int) regcomp(regex_t *, const char *, int);
  69. = #define REG_BASIC 0000
  70. = #define REG_EXTENDED 0001
  71. = #define REG_ICASE 0002
  72. = #define REG_NOSUB 0004
  73. = #define REG_NEWLINE 0010
  74. = #define REG_NOSPEC 0020
  75. = #define REG_PEND 0040
  76. = #define REG_DUMP 0200
  77. */
  78. API_EXPORT(int) /* 0 success, otherwise REG_something */
  79. regcomp(preg, pattern, cflags)
  80. regex_t *preg;
  81. const char *pattern;
  82. int cflags;
  83. {
  84. struct parse pa;
  85. register struct re_guts *g;
  86. register struct parse *p = &pa;
  87. register int i;
  88. register size_t len;
  89. #ifdef REDEBUG
  90. # define GOODFLAGS(f) (f)
  91. #else
  92. # define GOODFLAGS(f) ((f)&~REG_DUMP)
  93. #endif
  94. cflags = GOODFLAGS(cflags);
  95. if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
  96. return(REG_INVARG);
  97. if (cflags&REG_PEND) {
  98. if (preg->re_endp < pattern)
  99. return(REG_INVARG);
  100. len = preg->re_endp - pattern;
  101. } else
  102. len = strlen((char *)pattern);
  103. /* do the mallocs early so failure handling is easy */
  104. g = (struct re_guts *)malloc(sizeof(struct re_guts) +
  105. (NC-1)*sizeof(cat_t));
  106. if (g == NULL)
  107. return(REG_ESPACE);
  108. {
  109. /* Patched for CERT Vulnerability Note VU#695940, Feb 2015. */
  110. size_t new_ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
  111. if (new_ssize < len || new_ssize > LONG_MAX / sizeof(sop)) {
  112. free((char *) g);
  113. return REG_INVARG;
  114. }
  115. p->ssize = new_ssize;
  116. }
  117. p->strip = (sop *)malloc(p->ssize * sizeof(sop));
  118. p->slen = 0;
  119. if (p->strip == NULL) {
  120. free((char *)g);
  121. return(REG_ESPACE);
  122. }
  123. /* set things up */
  124. p->g = g;
  125. p->next = (unsigned char *)pattern; /* convenience; we do not modify it */
  126. p->end = p->next + len;
  127. p->error = 0;
  128. p->ncsalloc = 0;
  129. for (i = 0; i < NPAREN; i++) {
  130. p->pbegin[i] = 0;
  131. p->pend[i] = 0;
  132. }
  133. g->csetsize = NC;
  134. g->sets = NULL;
  135. g->setbits = NULL;
  136. g->ncsets = 0;
  137. g->cflags = cflags;
  138. g->iflags = 0;
  139. g->nbol = 0;
  140. g->neol = 0;
  141. g->must = NULL;
  142. g->mlen = 0;
  143. g->nsub = 0;
  144. g->ncategories = 1; /* category 0 is "everything else" */
  145. g->categories = &g->catspace[0];
  146. (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
  147. g->backrefs = 0;
  148. /* do it */
  149. EMIT(OEND, 0);
  150. g->firststate = THERE();
  151. if (cflags&REG_EXTENDED)
  152. p_ere(p, OUT);
  153. else if (cflags&REG_NOSPEC)
  154. p_str(p);
  155. else
  156. p_bre(p, OUT, OUT);
  157. EMIT(OEND, 0);
  158. g->laststate = THERE();
  159. /* tidy up loose ends and fill things in */
  160. categorize(p, g);
  161. stripsnug(p, g);
  162. findmust(p, g);
  163. g->nplus = pluscount(p, g);
  164. g->magic = MAGIC2;
  165. preg->re_nsub = g->nsub;
  166. preg->re_g = g;
  167. preg->re_magic = MAGIC1;
  168. #ifndef REDEBUG
  169. /* not debugging, so can't rely on the assert() in regexec() */
  170. if (g->iflags&BAD)
  171. SETERROR(REG_ASSERT);
  172. #endif
  173. /* win or lose, we're done */
  174. if (p->error != 0) /* lose */
  175. regfree(preg);
  176. return(p->error);
  177. }
  178. /*
  179. - p_ere - ERE parser top level, concatenation and alternation
  180. == static void p_ere(register struct parse *p, int stop);
  181. */
  182. static void
  183. p_ere(p, stop)
  184. register struct parse *p;
  185. int stop; /* character this ERE should end at */
  186. {
  187. register unsigned char c;
  188. register sopno prevback = 0;
  189. register sopno prevfwd = 0;
  190. register sopno conc;
  191. register int first = 1; /* is this the first alternative? */
  192. for (;;) {
  193. /* do a bunch of concatenated expressions */
  194. conc = HERE();
  195. while (MORE() && (c = PEEK()) != '|' && c != stop)
  196. p_ere_exp(p);
  197. (void) REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
  198. if (!EAT('|'))
  199. break; /* NOTE BREAK OUT */
  200. if (first) {
  201. INSERT(OCH_, conc); /* offset is wrong */
  202. prevfwd = conc;
  203. prevback = conc;
  204. first = 0;
  205. }
  206. ASTERN(OOR1, prevback);
  207. prevback = THERE();
  208. AHEAD(prevfwd); /* fix previous offset */
  209. prevfwd = HERE();
  210. EMIT(OOR2, 0); /* offset is very wrong */
  211. }
  212. if (!first) { /* tail-end fixups */
  213. AHEAD(prevfwd);
  214. ASTERN(O_CH, prevback);
  215. }
  216. assert(!MORE() || SEE(stop));
  217. }
  218. /*
  219. - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
  220. == static void p_ere_exp(register struct parse *p);
  221. */
  222. static void
  223. p_ere_exp(p)
  224. register struct parse *p;
  225. {
  226. register unsigned char c;
  227. register sopno pos;
  228. register int count;
  229. register int count2;
  230. register sopno subno;
  231. int wascaret = 0;
  232. assert(MORE()); /* caller should have ensured this */
  233. c = GETNEXT();
  234. pos = HERE();
  235. switch (c) {
  236. case '(':
  237. REQUIRE(MORE(), REG_EPAREN);
  238. p->g->nsub++;
  239. subno = p->g->nsub;
  240. if (subno < NPAREN)
  241. p->pbegin[subno] = HERE();
  242. EMIT(OLPAREN, subno);
  243. if (!SEE(')'))
  244. p_ere(p, ')');
  245. if (subno < NPAREN) {
  246. p->pend[subno] = HERE();
  247. assert(p->pend[subno] != 0);
  248. }
  249. EMIT(ORPAREN, subno);
  250. MUSTEAT(')', REG_EPAREN);
  251. break;
  252. #ifndef POSIX_MISTAKE
  253. case ')': /* happens only if no current unmatched ( */
  254. /*
  255. * You may ask, why the ifndef? Because I didn't notice
  256. * this until slightly too late for 1003.2, and none of the
  257. * other 1003.2 regular-expression reviewers noticed it at
  258. * all. So an unmatched ) is legal POSIX, at least until
  259. * we can get it fixed.
  260. */
  261. SETERROR(REG_EPAREN);
  262. break;
  263. #endif
  264. case '^':
  265. EMIT(OBOL, 0);
  266. p->g->iflags |= USEBOL;
  267. p->g->nbol++;
  268. wascaret = 1;
  269. break;
  270. case '$':
  271. EMIT(OEOL, 0);
  272. p->g->iflags |= USEEOL;
  273. p->g->neol++;
  274. break;
  275. case '|':
  276. SETERROR(REG_EMPTY);
  277. break;
  278. case '*':
  279. case '+':
  280. case '?':
  281. SETERROR(REG_BADRPT);
  282. break;
  283. case '.':
  284. if (p->g->cflags&REG_NEWLINE)
  285. nonnewline(p);
  286. else
  287. EMIT(OANY, 0);
  288. break;
  289. case '[':
  290. p_bracket(p);
  291. break;
  292. case '\\':
  293. REQUIRE(MORE(), REG_EESCAPE);
  294. c = GETNEXT();
  295. ordinary(p, c);
  296. break;
  297. case '{': /* okay as ordinary except if digit follows */
  298. REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
  299. /* FALLTHROUGH */
  300. default:
  301. ordinary(p, c);
  302. break;
  303. }
  304. if (!MORE())
  305. return;
  306. c = PEEK();
  307. /* we call { a repetition if followed by a digit */
  308. if (!( c == '*' || c == '+' || c == '?' ||
  309. (c == '{' && MORE2() && isdigit(PEEK2())) ))
  310. return; /* no repetition, we're done */
  311. NEXT();
  312. REQUIRE(!wascaret, REG_BADRPT);
  313. switch (c) {
  314. case '*': /* implemented as +? */
  315. /* this case does not require the (y|) trick, noKLUDGE */
  316. INSERT(OPLUS_, pos);
  317. ASTERN(O_PLUS, pos);
  318. INSERT(OQUEST_, pos);
  319. ASTERN(O_QUEST, pos);
  320. break;
  321. case '+':
  322. INSERT(OPLUS_, pos);
  323. ASTERN(O_PLUS, pos);
  324. break;
  325. case '?':
  326. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  327. INSERT(OCH_, pos); /* offset slightly wrong */
  328. ASTERN(OOR1, pos); /* this one's right */
  329. AHEAD(pos); /* fix the OCH_ */
  330. EMIT(OOR2, 0); /* offset very wrong... */
  331. AHEAD(THERE()); /* ...so fix it */
  332. ASTERN(O_CH, THERETHERE());
  333. break;
  334. case '{':
  335. count = p_count(p);
  336. if (EAT(',')) {
  337. if (isdigit(PEEK())) {
  338. count2 = p_count(p);
  339. REQUIRE(count <= count2, REG_BADBR);
  340. } else /* single number with comma */
  341. count2 = INFINITY;
  342. } else /* just a single number */
  343. count2 = count;
  344. repeat(p, pos, count, count2);
  345. if (!EAT('}')) { /* error heuristics */
  346. while (MORE() && PEEK() != '}')
  347. NEXT();
  348. REQUIRE(MORE(), REG_EBRACE);
  349. SETERROR(REG_BADBR);
  350. }
  351. break;
  352. }
  353. if (!MORE())
  354. return;
  355. c = PEEK();
  356. if (!( c == '*' || c == '+' || c == '?' ||
  357. (c == '{' && MORE2() && isdigit(PEEK2())) ) )
  358. return;
  359. SETERROR(REG_BADRPT);
  360. }
  361. /*
  362. - p_str - string (no metacharacters) "parser"
  363. == static void p_str(register struct parse *p);
  364. */
  365. static void
  366. p_str(p)
  367. register struct parse *p;
  368. {
  369. REQUIRE(MORE(), REG_EMPTY);
  370. while (MORE())
  371. ordinary(p, GETNEXT());
  372. }
  373. /*
  374. - p_bre - BRE parser top level, anchoring and concatenation
  375. == static void p_bre(register struct parse *p, register int end1, \
  376. == register int end2);
  377. * Giving end1 as OUT essentially eliminates the end1/end2 check.
  378. *
  379. * This implementation is a bit of a kludge, in that a trailing $ is first
  380. * taken as an ordinary character and then revised to be an anchor. The
  381. * only undesirable side effect is that '$' gets included as a character
  382. * category in such cases. This is fairly harmless; not worth fixing.
  383. * The amount of lookahead needed to avoid this kludge is excessive.
  384. */
  385. static void
  386. p_bre(p, end1, end2)
  387. register struct parse *p;
  388. register int end1; /* first terminating character */
  389. register int end2; /* second terminating character */
  390. {
  391. register sopno start = HERE();
  392. register int first = 1; /* first subexpression? */
  393. register int wasdollar = 0;
  394. if (EAT('^')) {
  395. EMIT(OBOL, 0);
  396. p->g->iflags |= USEBOL;
  397. p->g->nbol++;
  398. }
  399. while (MORE() && !SEETWO(end1, end2)) {
  400. wasdollar = p_simp_re(p, first);
  401. first = 0;
  402. }
  403. if (wasdollar) { /* oops, that was a trailing anchor */
  404. DROP(1);
  405. EMIT(OEOL, 0);
  406. p->g->iflags |= USEEOL;
  407. p->g->neol++;
  408. }
  409. REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
  410. }
  411. /*
  412. - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
  413. == static int p_simp_re(register struct parse *p, int starordinary);
  414. */
  415. static int /* was the simple RE an unbackslashed $? */
  416. p_simp_re(p, starordinary)
  417. register struct parse *p;
  418. int starordinary; /* is a leading * an ordinary character? */
  419. {
  420. register int c;
  421. register int count;
  422. register int count2;
  423. register sopno pos;
  424. register int i;
  425. register sopno subno;
  426. # define BACKSL (1<<CHAR_BIT)
  427. pos = HERE(); /* repetion op, if any, covers from here */
  428. assert(MORE()); /* caller should have ensured this */
  429. c = GETNEXT();
  430. if (c == '\\') {
  431. REQUIRE(MORE(), REG_EESCAPE);
  432. c = BACKSL | (unsigned char)GETNEXT();
  433. }
  434. switch (c) {
  435. case '.':
  436. if (p->g->cflags&REG_NEWLINE)
  437. nonnewline(p);
  438. else
  439. EMIT(OANY, 0);
  440. break;
  441. case '[':
  442. p_bracket(p);
  443. break;
  444. case BACKSL|'{':
  445. SETERROR(REG_BADRPT);
  446. break;
  447. case BACKSL|'(':
  448. p->g->nsub++;
  449. subno = p->g->nsub;
  450. if (subno < NPAREN)
  451. p->pbegin[subno] = HERE();
  452. EMIT(OLPAREN, subno);
  453. /* the MORE here is an error heuristic */
  454. if (MORE() && !SEETWO('\\', ')'))
  455. p_bre(p, '\\', ')');
  456. if (subno < NPAREN) {
  457. p->pend[subno] = HERE();
  458. assert(p->pend[subno] != 0);
  459. }
  460. EMIT(ORPAREN, subno);
  461. REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
  462. break;
  463. case BACKSL|')': /* should not get here -- must be user */
  464. case BACKSL|'}':
  465. SETERROR(REG_EPAREN);
  466. break;
  467. case BACKSL|'1':
  468. case BACKSL|'2':
  469. case BACKSL|'3':
  470. case BACKSL|'4':
  471. case BACKSL|'5':
  472. case BACKSL|'6':
  473. case BACKSL|'7':
  474. case BACKSL|'8':
  475. case BACKSL|'9':
  476. i = (c&~BACKSL) - '0';
  477. assert(i < NPAREN);
  478. if (p->pend[i] != 0) {
  479. assert(i <= p->g->nsub);
  480. EMIT(OBACK_, i);
  481. assert(p->pbegin[i] != 0);
  482. assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
  483. assert(OP(p->strip[p->pend[i]]) == ORPAREN);
  484. (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
  485. EMIT(O_BACK, i);
  486. } else
  487. SETERROR(REG_ESUBREG);
  488. p->g->backrefs = 1;
  489. break;
  490. case '*':
  491. REQUIRE(starordinary, REG_BADRPT);
  492. /* FALLTHROUGH */
  493. default:
  494. ordinary(p, (unsigned char)c); /* takes off BACKSL, if any */
  495. break;
  496. }
  497. if (EAT('*')) { /* implemented as +? */
  498. /* this case does not require the (y|) trick, noKLUDGE */
  499. INSERT(OPLUS_, pos);
  500. ASTERN(O_PLUS, pos);
  501. INSERT(OQUEST_, pos);
  502. ASTERN(O_QUEST, pos);
  503. } else if (EATTWO('\\', '{')) {
  504. count = p_count(p);
  505. if (EAT(',')) {
  506. if (MORE() && isdigit(PEEK())) {
  507. count2 = p_count(p);
  508. REQUIRE(count <= count2, REG_BADBR);
  509. } else /* single number with comma */
  510. count2 = INFINITY;
  511. } else /* just a single number */
  512. count2 = count;
  513. repeat(p, pos, count, count2);
  514. if (!EATTWO('\\', '}')) { /* error heuristics */
  515. while (MORE() && !SEETWO('\\', '}'))
  516. NEXT();
  517. REQUIRE(MORE(), REG_EBRACE);
  518. SETERROR(REG_BADBR);
  519. }
  520. } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
  521. return(1);
  522. return(0);
  523. }
  524. /*
  525. - p_count - parse a repetition count
  526. == static int p_count(register struct parse *p);
  527. */
  528. static int /* the value */
  529. p_count(p)
  530. register struct parse *p;
  531. {
  532. register int count = 0;
  533. register int ndigits = 0;
  534. while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
  535. count = count*10 + (GETNEXT() - '0');
  536. ndigits++;
  537. }
  538. REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
  539. return(count);
  540. }
  541. /*
  542. - p_bracket - parse a bracketed character list
  543. == static void p_bracket(register struct parse *p);
  544. *
  545. * Note a significant property of this code: if the allocset() did SETERROR,
  546. * no set operations are done.
  547. */
  548. static void
  549. p_bracket(p)
  550. register struct parse *p;
  551. {
  552. register cset *cs = allocset(p);
  553. register int invert = 0;
  554. /* Dept of Truly Sickening Special-Case Kludges */
  555. if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
  556. EMIT(OBOW, 0);
  557. NEXTn(6);
  558. return;
  559. }
  560. if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
  561. EMIT(OEOW, 0);
  562. NEXTn(6);
  563. return;
  564. }
  565. if (EAT('^'))
  566. invert++; /* make note to invert set at end */
  567. if (EAT(']'))
  568. CHadd(cs, ']');
  569. else if (EAT('-'))
  570. CHadd(cs, '-');
  571. while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
  572. p_b_term(p, cs);
  573. if (EAT('-'))
  574. CHadd(cs, '-');
  575. MUSTEAT(']', REG_EBRACK);
  576. if (p->error != 0) /* don't mess things up further */
  577. return;
  578. if (p->g->cflags&REG_ICASE) {
  579. register int i;
  580. register int ci;
  581. for (i = p->g->csetsize - 1; i >= 0; i--)
  582. if (CHIN(cs, i) && isalpha(i)) {
  583. ci = othercase(i);
  584. if (ci != i)
  585. CHadd(cs, ci);
  586. }
  587. if (cs->multis != NULL)
  588. mccase(p, cs);
  589. }
  590. if (invert) {
  591. register int i;
  592. for (i = p->g->csetsize - 1; i >= 0; i--)
  593. if (CHIN(cs, i))
  594. CHsub(cs, i);
  595. else
  596. CHadd(cs, i);
  597. if (p->g->cflags&REG_NEWLINE)
  598. CHsub(cs, '\n');
  599. if (cs->multis != NULL)
  600. mcinvert(p, cs);
  601. }
  602. assert(cs->multis == NULL); /* xxx */
  603. if (nch(p, cs) == 1) { /* optimize singleton sets */
  604. ordinary(p, firstch(p, cs));
  605. freeset(p, cs);
  606. } else
  607. EMIT(OANYOF, freezeset(p, cs));
  608. }
  609. /*
  610. - p_b_term - parse one term of a bracketed character list
  611. == static void p_b_term(register struct parse *p, register cset *cs);
  612. */
  613. static void
  614. p_b_term(p, cs)
  615. register struct parse *p;
  616. register cset *cs;
  617. {
  618. register unsigned char c;
  619. register unsigned char start, finish;
  620. register int i;
  621. /* classify what we've got */
  622. switch ((MORE()) ? PEEK() : '\0') {
  623. case '[':
  624. c = (MORE2()) ? PEEK2() : '\0';
  625. break;
  626. case '-':
  627. SETERROR(REG_ERANGE);
  628. return; /* NOTE RETURN */
  629. break;
  630. default:
  631. c = '\0';
  632. break;
  633. }
  634. switch (c) {
  635. case ':': /* character class */
  636. NEXT2();
  637. REQUIRE(MORE(), REG_EBRACK);
  638. c = PEEK();
  639. REQUIRE(c != '-' && c != ']', REG_ECTYPE);
  640. p_b_cclass(p, cs);
  641. REQUIRE(MORE(), REG_EBRACK);
  642. REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
  643. break;
  644. case '=': /* equivalence class */
  645. NEXT2();
  646. REQUIRE(MORE(), REG_EBRACK);
  647. c = PEEK();
  648. REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
  649. p_b_eclass(p, cs);
  650. REQUIRE(MORE(), REG_EBRACK);
  651. REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
  652. break;
  653. default: /* symbol, ordinary character, or range */
  654. /* xxx revision needed for multichar stuff */
  655. start = p_b_symbol(p);
  656. if (SEE('-') && MORE2() && PEEK2() != ']') {
  657. /* range */
  658. NEXT();
  659. if (EAT('-'))
  660. finish = '-';
  661. else
  662. finish = p_b_symbol(p);
  663. } else
  664. finish = start;
  665. /* xxx what about signed chars here... */
  666. REQUIRE(start <= finish, REG_ERANGE);
  667. for (i = start; i <= finish; i++)
  668. CHadd(cs, i);
  669. break;
  670. }
  671. }
  672. /*
  673. - p_b_cclass - parse a character-class name and deal with it
  674. == static void p_b_cclass(register struct parse *p, register cset *cs);
  675. */
  676. static void
  677. p_b_cclass(p, cs)
  678. register struct parse *p;
  679. register cset *cs;
  680. {
  681. register unsigned char *sp = p->next;
  682. register const struct cclass *cp;
  683. register size_t len;
  684. register const unsigned char *u;
  685. register unsigned char c;
  686. while (MORE() && isalpha(PEEK()))
  687. NEXT();
  688. len = p->next - sp;
  689. for (cp = cclasses; cp->name != NULL; cp++)
  690. if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
  691. break;
  692. if (cp->name == NULL) {
  693. /* oops, didn't find it */
  694. SETERROR(REG_ECTYPE);
  695. return;
  696. }
  697. u = cp->chars;
  698. while ((c = *u++) != '\0')
  699. CHadd(cs, c);
  700. for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
  701. MCadd(p, cs, u);
  702. }
  703. /*
  704. - p_b_eclass - parse an equivalence-class name and deal with it
  705. == static void p_b_eclass(register struct parse *p, register cset *cs);
  706. *
  707. * This implementation is incomplete. xxx
  708. */
  709. static void
  710. p_b_eclass(p, cs)
  711. register struct parse *p;
  712. register cset *cs;
  713. {
  714. register unsigned char c;
  715. c = p_b_coll_elem(p, '=');
  716. CHadd(cs, c);
  717. }
  718. /*
  719. - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
  720. == static char p_b_symbol(register struct parse *p);
  721. */
  722. static unsigned char /* value of symbol */
  723. p_b_symbol(p)
  724. register struct parse *p;
  725. {
  726. register unsigned char value;
  727. REQUIRE(MORE(), REG_EBRACK);
  728. if (!EATTWO('[', '.'))
  729. return(GETNEXT());
  730. /* collating symbol */
  731. value = p_b_coll_elem(p, '.');
  732. REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
  733. return(value);
  734. }
  735. /*
  736. - p_b_coll_elem - parse a collating-element name and look it up
  737. == static char p_b_coll_elem(register struct parse *p, int endc);
  738. */
  739. static unsigned char /* value of collating element */
  740. p_b_coll_elem(p, endc)
  741. register struct parse *p;
  742. int endc; /* name ended by endc,']' */
  743. {
  744. register unsigned char *sp = p->next;
  745. register const struct cname *cp;
  746. register int len;
  747. while (MORE() && !SEETWO(endc, ']'))
  748. NEXT();
  749. if (!MORE()) {
  750. SETERROR(REG_EBRACK);
  751. return(0);
  752. }
  753. len = p->next - sp;
  754. for (cp = cnames; cp->name != NULL; cp++)
  755. if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
  756. return(cp->code); /* known name */
  757. if (len == 1)
  758. return(*sp); /* single character */
  759. SETERROR(REG_ECOLLATE); /* neither */
  760. return(0);
  761. }
  762. /*
  763. - othercase - return the case counterpart of an alphabetic
  764. == static char othercase(int ch);
  765. */
  766. static unsigned char /* if no counterpart, return ch */
  767. othercase(ch)
  768. int ch;
  769. {
  770. assert(isalpha(ch));
  771. if (isupper(ch))
  772. return(tolower(ch));
  773. else if (islower(ch))
  774. return(toupper(ch));
  775. else /* peculiar, but could happen */
  776. return(ch);
  777. }
  778. /*
  779. - bothcases - emit a dualcase version of a two-case character
  780. == static void bothcases(register struct parse *p, int ch);
  781. *
  782. * Boy, is this implementation ever a kludge...
  783. */
  784. static void
  785. bothcases(p, ch)
  786. register struct parse *p;
  787. int ch;
  788. {
  789. register unsigned char *oldnext = p->next;
  790. register unsigned char *oldend = p->end;
  791. unsigned char bracket[3];
  792. assert(othercase(ch) != ch); /* p_bracket() would recurse */
  793. p->next = bracket;
  794. p->end = bracket+2;
  795. bracket[0] = ch;
  796. bracket[1] = ']';
  797. bracket[2] = '\0';
  798. p_bracket(p);
  799. assert(p->next == bracket+2);
  800. p->next = oldnext;
  801. p->end = oldend;
  802. }
  803. /*
  804. - ordinary - emit an ordinary character
  805. == static void ordinary(register struct parse *p, register int ch);
  806. */
  807. static void
  808. ordinary(p, ch)
  809. register struct parse *p;
  810. register int ch;
  811. {
  812. register cat_t *cap = p->g->categories;
  813. if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
  814. bothcases(p, ch);
  815. else {
  816. EMIT(OCHAR, (unsigned char)ch);
  817. if (cap[ch] == 0)
  818. cap[ch] = p->g->ncategories++;
  819. }
  820. }
  821. /*
  822. - nonnewline - emit REG_NEWLINE version of OANY
  823. == static void nonnewline(register struct parse *p);
  824. *
  825. * Boy, is this implementation ever a kludge...
  826. */
  827. static void
  828. nonnewline(p)
  829. register struct parse *p;
  830. {
  831. register unsigned char *oldnext = p->next;
  832. register unsigned char *oldend = p->end;
  833. unsigned char bracket[4];
  834. p->next = bracket;
  835. p->end = bracket+3;
  836. bracket[0] = '^';
  837. bracket[1] = '\n';
  838. bracket[2] = ']';
  839. bracket[3] = '\0';
  840. p_bracket(p);
  841. assert(p->next == bracket+3);
  842. p->next = oldnext;
  843. p->end = oldend;
  844. }
  845. /*
  846. - repeat - generate code for a bounded repetition, recursively if needed
  847. == static void repeat(register struct parse *p, sopno start, int from, int to);
  848. */
  849. static void
  850. repeat(p, start, from, to)
  851. register struct parse *p;
  852. sopno start; /* operand from here to end of strip */
  853. int from; /* repeated from this number */
  854. int to; /* to this number of times (maybe INFINITY) */
  855. {
  856. register sopno finish = HERE();
  857. # define N 2
  858. # define INF 3
  859. # define REP(f, t) ((f)*8 + (t))
  860. # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
  861. register sopno copy;
  862. if (p->error != 0) /* head off possible runaway recursion */
  863. return;
  864. assert(from <= to);
  865. switch (REP(MAP(from), MAP(to))) {
  866. case REP(0, 0): /* must be user doing this */
  867. DROP(finish-start); /* drop the operand */
  868. break;
  869. case REP(0, 1): /* as x{1,1}? */
  870. case REP(0, N): /* as x{1,n}? */
  871. case REP(0, INF): /* as x{1,}? */
  872. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  873. INSERT(OCH_, start); /* offset is wrong... */
  874. repeat(p, start+1, 1, to);
  875. ASTERN(OOR1, start);
  876. AHEAD(start); /* ... fix it */
  877. EMIT(OOR2, 0);
  878. AHEAD(THERE());
  879. ASTERN(O_CH, THERETHERE());
  880. break;
  881. case REP(1, 1): /* trivial case */
  882. /* done */
  883. break;
  884. case REP(1, N): /* as x?x{1,n-1} */
  885. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  886. INSERT(OCH_, start);
  887. ASTERN(OOR1, start);
  888. AHEAD(start);
  889. EMIT(OOR2, 0); /* offset very wrong... */
  890. AHEAD(THERE()); /* ...so fix it */
  891. ASTERN(O_CH, THERETHERE());
  892. copy = dupl(p, start+1, finish+1);
  893. assert(copy == finish+4);
  894. repeat(p, copy, 1, to-1);
  895. break;
  896. case REP(1, INF): /* as x+ */
  897. INSERT(OPLUS_, start);
  898. ASTERN(O_PLUS, start);
  899. break;
  900. case REP(N, N): /* as xx{m-1,n-1} */
  901. copy = dupl(p, start, finish);
  902. repeat(p, copy, from-1, to-1);
  903. break;
  904. case REP(N, INF): /* as xx{n-1,INF} */
  905. copy = dupl(p, start, finish);
  906. repeat(p, copy, from-1, to);
  907. break;
  908. default: /* "can't happen" */
  909. SETERROR(REG_ASSERT); /* just in case */
  910. break;
  911. }
  912. }
  913. /*
  914. - seterr - set an error condition
  915. == static int seterr(register struct parse *p, int e);
  916. */
  917. static int /* useless but makes type checking happy */
  918. seterr(p, e)
  919. register struct parse *p;
  920. int e;
  921. {
  922. if (p->error == 0) /* keep earliest error condition */
  923. p->error = e;
  924. p->next = nuls; /* try to bring things to a halt */
  925. p->end = nuls;
  926. return(0); /* make the return value well-defined */
  927. }
  928. /*
  929. - allocset - allocate a set of characters for []
  930. == static cset *allocset(register struct parse *p);
  931. */
  932. static cset *
  933. allocset(p)
  934. register struct parse *p;
  935. {
  936. register int no = p->g->ncsets++;
  937. register size_t nc;
  938. register size_t nbytes;
  939. register cset *cs;
  940. register size_t css = (size_t)p->g->csetsize;
  941. register int i;
  942. if (no >= p->ncsalloc) { /* need another column of space */
  943. p->ncsalloc += CHAR_BIT;
  944. nc = p->ncsalloc;
  945. assert(nc % CHAR_BIT == 0);
  946. nbytes = nc / CHAR_BIT * css;
  947. if (p->g->sets == NULL)
  948. p->g->sets = (cset *)malloc(nc * sizeof(cset));
  949. else
  950. p->g->sets = (cset *)realloc((unsigned char *)p->g->sets,
  951. nc * sizeof(cset));
  952. if (p->g->setbits == NULL)
  953. p->g->setbits = (uch *)malloc(nbytes);
  954. else {
  955. p->g->setbits = (uch *)realloc((unsigned char *)p->g->setbits,
  956. nbytes);
  957. /* xxx this isn't right if setbits is now NULL */
  958. for (i = 0; i < no; i++)
  959. p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
  960. }
  961. if (p->g->sets != NULL && p->g->setbits != NULL)
  962. (void) memset((unsigned char *)p->g->setbits + (nbytes - css),
  963. 0, css);
  964. else {
  965. no = 0;
  966. SETERROR(REG_ESPACE);
  967. /* caller's responsibility not to do set ops */
  968. }
  969. }
  970. assert(p->g->sets != NULL); /* xxx */
  971. cs = &p->g->sets[no];
  972. cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
  973. cs->mask = 1 << ((no) % CHAR_BIT);
  974. cs->hash = 0;
  975. cs->smultis = 0;
  976. cs->multis = NULL;
  977. return(cs);
  978. }
  979. /*
  980. - freeset - free a now-unused set
  981. == static void freeset(register struct parse *p, register cset *cs);
  982. */
  983. static void
  984. freeset(p, cs)
  985. register struct parse *p;
  986. register cset *cs;
  987. {
  988. register size_t i;
  989. register cset *top = &p->g->sets[p->g->ncsets];
  990. register size_t css = (size_t)p->g->csetsize;
  991. for (i = 0; i < css; i++)
  992. CHsub(cs, i);
  993. if (cs == top-1) /* recover only the easy case */
  994. p->g->ncsets--;
  995. }
  996. /*
  997. - freezeset - final processing on a set of characters
  998. == static int freezeset(register struct parse *p, register cset *cs);
  999. *
  1000. * The main task here is merging identical sets. This is usually a waste
  1001. * of time (although the hash code minimizes the overhead), but can win
  1002. * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
  1003. * is done using addition rather than xor -- all ASCII [aA] sets xor to
  1004. * the same value!
  1005. */
  1006. static int /* set number */
  1007. freezeset(p, cs)
  1008. register struct parse *p;
  1009. register cset *cs;
  1010. {
  1011. register uch h = cs->hash;
  1012. register size_t i;
  1013. register cset *top = &p->g->sets[p->g->ncsets];
  1014. register cset *cs2;
  1015. register size_t css = (size_t)p->g->csetsize;
  1016. /* look for an earlier one which is the same */
  1017. for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
  1018. if (cs2->hash == h && cs2 != cs) {
  1019. /* maybe */
  1020. for (i = 0; i < css; i++)
  1021. if (!!CHIN(cs2, i) != !!CHIN(cs, i))
  1022. break; /* no */
  1023. if (i == css)
  1024. break; /* yes */
  1025. }
  1026. if (cs2 < top) { /* found one */
  1027. freeset(p, cs);
  1028. cs = cs2;
  1029. }
  1030. return((int)(cs - p->g->sets));
  1031. }
  1032. /*
  1033. - firstch - return first character in a set (which must have at least one)
  1034. == static int firstch(register struct parse *p, register cset *cs);
  1035. */
  1036. static int /* character; there is no "none" value */
  1037. firstch(p, cs)
  1038. register struct parse *p;
  1039. register cset *cs;
  1040. {
  1041. register size_t i;
  1042. register size_t css = (size_t)p->g->csetsize;
  1043. for (i = 0; i < css; i++)
  1044. if (CHIN(cs, i))
  1045. return((unsigned char)i);
  1046. assert(never);
  1047. return(0); /* arbitrary */
  1048. }
  1049. /*
  1050. - nch - number of characters in a set
  1051. == static int nch(register struct parse *p, register cset *cs);
  1052. */
  1053. static int
  1054. nch(p, cs)
  1055. register struct parse *p;
  1056. register cset *cs;
  1057. {
  1058. register size_t i;
  1059. register size_t css = (size_t)p->g->csetsize;
  1060. register int n = 0;
  1061. for (i = 0; i < css; i++)
  1062. if (CHIN(cs, i))
  1063. n++;
  1064. return(n);
  1065. }
  1066. /*
  1067. - mcadd - add a collating element to a cset
  1068. == static void mcadd(register struct parse *p, register cset *cs, \
  1069. == register char *cp);
  1070. */
  1071. static void
  1072. mcadd(p, cs, cp)
  1073. register struct parse *p;
  1074. register cset *cs;
  1075. register const unsigned char *cp;
  1076. {
  1077. register size_t oldend = cs->smultis;
  1078. cs->smultis += strlen(cp) + 1;
  1079. if (cs->multis == NULL)
  1080. cs->multis = malloc(cs->smultis);
  1081. else
  1082. cs->multis = realloc(cs->multis, cs->smultis);
  1083. if (cs->multis == NULL) {
  1084. SETERROR(REG_ESPACE);
  1085. return;
  1086. }
  1087. (void) strcpy(cs->multis + oldend - 1, cp);
  1088. cs->multis[cs->smultis - 1] = '\0';
  1089. }
  1090. #if 0
  1091. /*
  1092. - mcsub - subtract a collating element from a cset
  1093. == static void mcsub(register cset *cs, register unsigned char *cp);
  1094. */
  1095. static void
  1096. mcsub(cs, cp)
  1097. register unsigned cset *cs;
  1098. register unsigned char *cp;
  1099. {
  1100. register unsigned char *fp = mcfind(cs, cp);
  1101. register size_t len = strlen(fp);
  1102. assert(fp != NULL);
  1103. (void) memmove(fp, fp + len + 1,
  1104. cs->smultis - (fp + len + 1 - cs->multis));
  1105. cs->smultis -= len;
  1106. if (cs->smultis == 0) {
  1107. free(cs->multis);
  1108. cs->multis = NULL;
  1109. return;
  1110. }
  1111. cs->multis = realloc(cs->multis, cs->smultis);
  1112. assert(cs->multis != NULL);
  1113. }
  1114. /*
  1115. - mcin - is a collating element in a cset?
  1116. == static int mcin(register cset *cs, register unsigned char *cp);
  1117. */
  1118. static int
  1119. mcin(cs, cp)
  1120. register cset *cs;
  1121. register unsigned char *cp;
  1122. {
  1123. return(mcfind(cs, cp) != NULL);
  1124. }
  1125. /*
  1126. - mcfind - find a collating element in a cset
  1127. == static unsigned char *mcfind(register cset *cs, register unsigned char *cp);
  1128. */
  1129. static unsigned char *
  1130. mcfind(cs, cp)
  1131. register cset *cs;
  1132. register unsigned char *cp;
  1133. {
  1134. register unsigned char *p;
  1135. if (cs->multis == NULL)
  1136. return(NULL);
  1137. for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
  1138. if (strcmp(cp, p) == 0)
  1139. return(p);
  1140. return(NULL);
  1141. }
  1142. #endif
  1143. /*
  1144. - mcinvert - invert the list of collating elements in a cset
  1145. == static void mcinvert(register struct parse *p, register cset *cs);
  1146. *
  1147. * This would have to know the set of possibilities. Implementation
  1148. * is deferred.
  1149. */
  1150. static void
  1151. mcinvert(p, cs)
  1152. register struct parse *p;
  1153. register cset *cs;
  1154. {
  1155. assert(cs->multis == NULL); /* xxx */
  1156. }
  1157. /*
  1158. - mccase - add case counterparts of the list of collating elements in a cset
  1159. == static void mccase(register struct parse *p, register cset *cs);
  1160. *
  1161. * This would have to know the set of possibilities. Implementation
  1162. * is deferred.
  1163. */
  1164. static void
  1165. mccase(p, cs)
  1166. register struct parse *p;
  1167. register cset *cs;
  1168. {
  1169. assert(cs->multis == NULL); /* xxx */
  1170. }
  1171. /*
  1172. - isinsets - is this character in any sets?
  1173. == static int isinsets(register struct re_guts *g, int c);
  1174. */
  1175. static int /* predicate */
  1176. isinsets(g, c)
  1177. register struct re_guts *g;
  1178. int c;
  1179. {
  1180. register uch *col;
  1181. register int i;
  1182. register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
  1183. register unsigned uc = (unsigned char)c;
  1184. if (!g->setbits) {
  1185. return(0);
  1186. }
  1187. for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
  1188. if (col[uc] != 0)
  1189. return(1);
  1190. return(0);
  1191. }
  1192. /*
  1193. - samesets - are these two characters in exactly the same sets?
  1194. == static int samesets(register struct re_guts *g, int c1, int c2);
  1195. */
  1196. static int /* predicate */
  1197. samesets(g, c1, c2)
  1198. register struct re_guts *g;
  1199. int c1;
  1200. int c2;
  1201. {
  1202. register uch *col;
  1203. register int i;
  1204. register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
  1205. register unsigned uc1 = (unsigned char)c1;
  1206. register unsigned uc2 = (unsigned char)c2;
  1207. for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
  1208. if (col[uc1] != col[uc2])
  1209. return(0);
  1210. return(1);
  1211. }
  1212. /*
  1213. - categorize - sort out character categories
  1214. == static void categorize(struct parse *p, register struct re_guts *g);
  1215. */
  1216. static void
  1217. categorize(p, g)
  1218. struct parse *p;
  1219. register struct re_guts *g;
  1220. {
  1221. register cat_t *cats = g->categories;
  1222. register int c;
  1223. register int c2;
  1224. register cat_t cat;
  1225. /* avoid making error situations worse */
  1226. if (p->error != 0)
  1227. return;
  1228. for (c = 0; c <= UCHAR_MAX; c++)
  1229. if (cats[c] == 0 && isinsets(g, c)) {
  1230. cat = g->ncategories++;
  1231. cats[c] = cat;
  1232. for (c2 = c+1; c2 <= UCHAR_MAX; c2++)
  1233. if (cats[c2] == 0 && samesets(g, c, c2))
  1234. cats[c2] = cat;
  1235. }
  1236. }
  1237. /*
  1238. - dupl - emit a duplicate of a bunch of sops
  1239. == static sopno dupl(register struct parse *p, sopno start, sopno finish);
  1240. */
  1241. static sopno /* start of duplicate */
  1242. dupl(p, start, finish)
  1243. register struct parse *p;
  1244. sopno start; /* from here */
  1245. sopno finish; /* to this less one */
  1246. {
  1247. register sopno ret = HERE();
  1248. register sopno len = finish - start;
  1249. assert(finish >= start);
  1250. if (len == 0)
  1251. return(ret);
  1252. enlarge(p, p->ssize + len); /* this many unexpected additions */
  1253. assert(p->ssize >= p->slen + len);
  1254. (void) memcpy((char *)(p->strip + p->slen),
  1255. (char *)(p->strip + start), (size_t)len*sizeof(sop));
  1256. p->slen += len;
  1257. return(ret);
  1258. }
  1259. /*
  1260. - doemit - emit a strip operator
  1261. == static void doemit(register struct parse *p, sop op, size_t opnd);
  1262. *
  1263. * It might seem better to implement this as a macro with a function as
  1264. * hard-case backup, but it's just too big and messy unless there are
  1265. * some changes to the data structures. Maybe later.
  1266. */
  1267. static void
  1268. doemit(p, op, opnd)
  1269. register struct parse *p;
  1270. sop op;
  1271. size_t opnd;
  1272. {
  1273. /* avoid making error situations worse */
  1274. if (p->error != 0)
  1275. return;
  1276. /* deal with oversize operands ("can't happen", more or less) */
  1277. assert(opnd < 1<<OPSHIFT);
  1278. /* deal with undersized strip */
  1279. if (p->slen >= p->ssize)
  1280. enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
  1281. assert(p->slen < p->ssize);
  1282. /* finally, it's all reduced to the easy case */
  1283. p->strip[p->slen++] = SOP(op, opnd);
  1284. }
  1285. /*
  1286. - doinsert - insert a sop into the strip
  1287. == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
  1288. */
  1289. static void
  1290. doinsert(p, op, opnd, pos)
  1291. register struct parse *p;
  1292. sop op;
  1293. size_t opnd;
  1294. sopno pos;
  1295. {
  1296. register sopno sn;
  1297. register sop s;
  1298. register int i;
  1299. /* avoid making error situations worse */
  1300. if (p->error != 0)
  1301. return;
  1302. sn = HERE();
  1303. EMIT(op, opnd); /* do checks, ensure space */
  1304. assert(HERE() == sn+1);
  1305. s = p->strip[sn];
  1306. /* adjust paren pointers */
  1307. assert(pos > 0);
  1308. for (i = 1; i < NPAREN; i++) {
  1309. if (p->pbegin[i] >= pos) {
  1310. p->pbegin[i]++;
  1311. }
  1312. if (p->pend[i] >= pos) {
  1313. p->pend[i]++;
  1314. }
  1315. }
  1316. memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
  1317. (HERE()-pos-1)*sizeof(sop));
  1318. p->strip[pos] = s;
  1319. }
  1320. /*
  1321. - dofwd - complete a forward reference
  1322. == static void dofwd(register struct parse *p, sopno pos, sop value);
  1323. */
  1324. static void
  1325. dofwd(p, pos, value)
  1326. register struct parse *p;
  1327. register sopno pos;
  1328. sop value;
  1329. {
  1330. /* avoid making error situations worse */
  1331. if (p->error != 0)
  1332. return;
  1333. assert(value < 1<<OPSHIFT);
  1334. p->strip[pos] = OP(p->strip[pos]) | value;
  1335. }
  1336. /*
  1337. - enlarge - enlarge the strip
  1338. == static void enlarge(register struct parse *p, sopno size);
  1339. */
  1340. static void
  1341. enlarge(p, size)
  1342. register struct parse *p;
  1343. register sopno size;
  1344. {
  1345. register sop *sp;
  1346. if (p->ssize >= size)
  1347. return;
  1348. sp = (sop *)realloc(p->strip, size*sizeof(sop));
  1349. if (sp == NULL) {
  1350. SETERROR(REG_ESPACE);
  1351. return;
  1352. }
  1353. p->strip = sp;
  1354. p->ssize = size;
  1355. }
  1356. /*
  1357. - stripsnug - compact the strip
  1358. == static void stripsnug(register struct parse *p, register struct re_guts *g);
  1359. */
  1360. static void
  1361. stripsnug(p, g)
  1362. register struct parse *p;
  1363. register struct re_guts *g;
  1364. {
  1365. g->nstates = p->slen;
  1366. g->strip = (sop *)realloc((unsigned char *)p->strip, p->slen * sizeof(sop));
  1367. if (g->strip == NULL) {
  1368. SETERROR(REG_ESPACE);
  1369. g->strip = p->strip;
  1370. }
  1371. }
  1372. /*
  1373. - findmust - fill in must and mlen with longest mandatory literal string
  1374. == static void findmust(register struct parse *p, register struct re_guts *g);
  1375. *
  1376. * This algorithm could do fancy things like analyzing the operands of |
  1377. * for common subsequences. Someday. This code is simple and finds most
  1378. * of the interesting cases.
  1379. *
  1380. * Note that must and mlen got initialized during setup.
  1381. */
  1382. static void
  1383. findmust(p, g)
  1384. struct parse *p;
  1385. register struct re_guts *g;
  1386. {
  1387. register sop *scan;
  1388. sop *start = NULL;
  1389. register sop *newstart = NULL;
  1390. register sopno newlen;
  1391. register sop s;
  1392. register unsigned char *cp;
  1393. register sopno i;
  1394. /* avoid making error situations worse */
  1395. if (p->error != 0)
  1396. return;
  1397. /* find the longest OCHAR sequence in strip */
  1398. newlen = 0;
  1399. scan = g->strip + 1;
  1400. do {
  1401. s = *scan++;
  1402. switch (OP(s)) {
  1403. case OCHAR: /* sequence member */
  1404. if (newlen == 0) /* new sequence */
  1405. newstart = scan - 1;
  1406. newlen++;
  1407. break;
  1408. case OPLUS_: /* things that don't break one */
  1409. case OLPAREN:
  1410. case ORPAREN:
  1411. break;
  1412. case OQUEST_: /* things that must be skipped */
  1413. case OCH_:
  1414. scan--;
  1415. do {
  1416. scan += OPND(s);
  1417. s = *scan;
  1418. /* assert() interferes w debug printouts */
  1419. if (OP(s) != O_QUEST && OP(s) != O_CH &&
  1420. OP(s) != OOR2) {
  1421. g->iflags |= BAD;
  1422. return;
  1423. }
  1424. } while (OP(s) != O_QUEST && OP(s) != O_CH);
  1425. /* fallthrough */
  1426. default: /* things that break a sequence */
  1427. if (newlen > g->mlen) { /* ends one */
  1428. start = newstart;
  1429. g->mlen = newlen;
  1430. }
  1431. newlen = 0;
  1432. break;
  1433. }
  1434. } while (OP(s) != OEND);
  1435. if (g->mlen == 0) /* there isn't one */
  1436. return;
  1437. if (!start) {
  1438. g->mlen = 0;
  1439. return;
  1440. }
  1441. /* turn it into a character string */
  1442. g->must = malloc((size_t)g->mlen + 1);
  1443. if (g->must == NULL) { /* argh; just forget it */
  1444. g->mlen = 0;
  1445. return;
  1446. }
  1447. cp = g->must;
  1448. scan = start;
  1449. for (i = g->mlen; i > 0; i--) {
  1450. while (OP(s = *scan++) != OCHAR)
  1451. continue;
  1452. assert(cp < g->must + g->mlen);
  1453. *cp++ = (unsigned char)OPND(s);
  1454. }
  1455. assert(cp == g->must + g->mlen);
  1456. *cp++ = '\0'; /* just on general principles */
  1457. }
  1458. /*
  1459. - pluscount - count + nesting
  1460. == static sopno pluscount(register struct parse *p, register struct re_guts *g);
  1461. */
  1462. static sopno /* nesting depth */
  1463. pluscount(p, g)
  1464. struct parse *p;
  1465. register struct re_guts *g;
  1466. {
  1467. register sop *scan;
  1468. register sop s;
  1469. register sopno plusnest = 0;
  1470. register sopno maxnest = 0;
  1471. if (p->error != 0)
  1472. return(0); /* there may not be an OEND */
  1473. scan = g->strip + 1;
  1474. do {
  1475. s = *scan++;
  1476. switch (OP(s)) {
  1477. case OPLUS_:
  1478. plusnest++;
  1479. break;
  1480. case O_PLUS:
  1481. if (plusnest > maxnest)
  1482. maxnest = plusnest;
  1483. plusnest--;
  1484. break;
  1485. }
  1486. } while (OP(s) != OEND);
  1487. if (plusnest != 0)
  1488. g->iflags |= BAD;
  1489. return(maxnest);
  1490. }