RegularExpression.hxx 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548
  1. /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
  2. file Copyright.txt or https://cmake.org/licensing#kwsys for details. */
  3. // Original Copyright notice:
  4. // Copyright (C) 1991 Texas Instruments Incorporated.
  5. //
  6. // Permission is granted to any individual or institution to use, copy, modify,
  7. // and distribute this software, provided that this complete copyright and
  8. // permission notice is maintained, intact, in all copies and supporting
  9. // documentation.
  10. //
  11. // Texas Instruments Incorporated provides this software "as is" without
  12. // express or implied warranty.
  13. //
  14. // Created: MNF 06/13/89 Initial Design and Implementation
  15. // Updated: LGO 08/09/89 Inherit from Generic
  16. // Updated: MBN 09/07/89 Added conditional exception handling
  17. // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
  18. // Updated: DLS 03/22/91 New lite version
  19. //
  20. #ifndef cmsys_RegularExpression_hxx
  21. #define cmsys_RegularExpression_hxx
  22. #include <cmsys/Configure.h>
  23. #include <cmsys/Configure.hxx>
  24. #include <string>
  25. /* Disable useless Borland warnings. KWSys tries not to force things
  26. on its includers, but there is no choice here. */
  27. #if defined(__BORLANDC__)
  28. #pragma warn - 8027 /* function not inlined. */
  29. #endif
  30. namespace cmsys {
  31. // Forward declaration
  32. class RegularExpression;
  33. /** \class RegularExpressionMatch
  34. * \brief Stores the pattern matches of a RegularExpression
  35. */
  36. class cmsys_EXPORT RegularExpressionMatch
  37. {
  38. public:
  39. RegularExpressionMatch();
  40. bool isValid() const;
  41. void clear();
  42. std::string::size_type start() const;
  43. std::string::size_type end() const;
  44. std::string::size_type start(int n) const;
  45. std::string::size_type end(int n) const;
  46. std::string match(int n) const;
  47. enum
  48. {
  49. NSUBEXP = 10
  50. };
  51. private:
  52. friend class RegularExpression;
  53. const char* startp[NSUBEXP];
  54. const char* endp[NSUBEXP];
  55. const char* searchstring;
  56. };
  57. /**
  58. * \brief Creates an invalid match object
  59. */
  60. inline RegularExpressionMatch::RegularExpressionMatch()
  61. {
  62. startp[0] = 0;
  63. endp[0] = 0;
  64. searchstring = 0;
  65. }
  66. /**
  67. * \brief Returns true if the match pointers are valid
  68. */
  69. inline bool RegularExpressionMatch::isValid() const
  70. {
  71. return (this->startp[0] != 0);
  72. }
  73. /**
  74. * \brief Resets to the (invalid) construction state.
  75. */
  76. inline void RegularExpressionMatch::clear()
  77. {
  78. startp[0] = 0;
  79. endp[0] = 0;
  80. searchstring = 0;
  81. }
  82. /**
  83. * \brief Returns the start index of the full match.
  84. */
  85. inline std::string::size_type RegularExpressionMatch::start() const
  86. {
  87. return static_cast<std::string::size_type>(this->startp[0] - searchstring);
  88. }
  89. /**
  90. * \brief Returns the end index of the full match.
  91. */
  92. inline std::string::size_type RegularExpressionMatch::end() const
  93. {
  94. return static_cast<std::string::size_type>(this->endp[0] - searchstring);
  95. }
  96. /**
  97. * \brief Returns the start index of nth submatch.
  98. * start(0) is the start of the full match.
  99. */
  100. inline std::string::size_type RegularExpressionMatch::start(int n) const
  101. {
  102. return static_cast<std::string::size_type>(this->startp[n] -
  103. this->searchstring);
  104. }
  105. /**
  106. * \brief Returns the end index of nth submatch.
  107. * end(0) is the end of the full match.
  108. */
  109. inline std::string::size_type RegularExpressionMatch::end(int n) const
  110. {
  111. return static_cast<std::string::size_type>(this->endp[n] -
  112. this->searchstring);
  113. }
  114. /**
  115. * \brief Returns the nth submatch as a string.
  116. */
  117. inline std::string RegularExpressionMatch::match(int n) const
  118. {
  119. if (this->startp[n] == 0) {
  120. return std::string();
  121. } else {
  122. return std::string(this->startp[n], static_cast<std::string::size_type>(
  123. this->endp[n] - this->startp[n]));
  124. }
  125. }
  126. /** \class RegularExpression
  127. * \brief Implements pattern matching with regular expressions.
  128. *
  129. * This is the header file for the regular expression class. An object of
  130. * this class contains a regular expression, in a special "compiled" format.
  131. * This compiled format consists of several slots all kept as the objects
  132. * private data. The RegularExpression class provides a convenient way to
  133. * represent regular expressions. It makes it easy to search for the same
  134. * regular expression in many different strings without having to compile a
  135. * string to regular expression format more than necessary.
  136. *
  137. * This class implements pattern matching via regular expressions.
  138. * A regular expression allows a programmer to specify complex
  139. * patterns that can be searched for and matched against the
  140. * character string of a string object. In its simplest form, a
  141. * regular expression is a sequence of characters used to
  142. * search for exact character matches. However, many times the
  143. * exact sequence to be found is not known, or only a match at
  144. * the beginning or end of a string is desired. The RegularExpression
  145. * class implements regular expression pattern
  146. * matching as is found and implemented in many UNIX commands
  147. * and utilities.
  148. *
  149. * Example: The perl code
  150. *
  151. * $filename =~ m"([a-z]+)\.cc";
  152. * print $1;
  153. *
  154. * Is written as follows in C++
  155. *
  156. * RegularExpression re("([a-z]+)\\.cc");
  157. * re.find(filename);
  158. * cerr << re.match(1);
  159. *
  160. *
  161. * The regular expression class provides a convenient mechanism
  162. * for specifying and manipulating regular expressions. The
  163. * regular expression object allows specification of such patterns
  164. * by using the following regular expression
  165. * metacharacters:
  166. *
  167. * ^ Matches at beginning of a line
  168. *
  169. * $ Matches at end of a line
  170. *
  171. * . Matches any single character
  172. *
  173. * [ ] Matches any character(s) inside the brackets
  174. *
  175. * [^ ] Matches any character(s) not inside the brackets
  176. *
  177. * - Matches any character in range on either side of a dash
  178. *
  179. * * Matches preceding pattern zero or more times
  180. *
  181. * + Matches preceding pattern one or more times
  182. *
  183. * ? Matches preceding pattern zero or once only
  184. *
  185. * () Saves a matched expression and uses it in a later match
  186. *
  187. * Note that more than one of these metacharacters can be used
  188. * in a single regular expression in order to create complex
  189. * search patterns. For example, the pattern [^ab1-9] says to
  190. * match any single character that is not an "a", not a "b",
  191. * and not one of the digits one
  192. * through nine.
  193. *
  194. * There are four constructors for RegularExpression. One just creates an
  195. * empty RegularExpression object. Two others create a RegularExpression
  196. * object and initialize it with a regular expression that is given in the
  197. * form of a char* or a std::string. The last takes a reference to a
  198. * RegularExpression object as an argument and creates an object initialized
  199. * with the information from the given RegularExpression object.
  200. *
  201. * The find member function finds the first occurrence of the regular
  202. * expression of that object in the string given to find as an argument. Find
  203. * returns a boolean, and if true, mutates the private data appropriately.
  204. * Find sets pointers to the beginning and end of the thing last found, they
  205. * are pointers into the actual string that was searched. The start and end
  206. * member functions return indices into the searched string that correspond
  207. * to the beginning and end pointers respectively. The compile member
  208. * function takes a char* and puts the compiled version of the char* argument
  209. * into the object's private data fields. The == and != operators only check
  210. * to see if the compiled regular expression is the same, and the
  211. * deep_equal function also checks to see if the start and end pointers are
  212. * the same. The is_valid function returns false if program is set to NULL
  213. * (i.e. there is no valid compiled expression). The set_invalid function sets
  214. * the program to NULL (Warning: this deletes the compiled expression). The
  215. * following examples may help clarify regular expression usage:
  216. *
  217. * * The regular expression "^hello" matches a "hello" only at the
  218. * beginning of a line. It would match "hello there" but not "hi,
  219. * hello there".
  220. *
  221. * * The regular expression "long$" matches a "long" only at the end
  222. * of a line. It would match "so long\0", but not "long ago".
  223. *
  224. * * The regular expression "t..t..g" will match anything that has a
  225. * "t" then any two characters, another "t", any two characters and
  226. * then a "g". It will match "testing", or "test again" but would
  227. * not match "toasting"
  228. *
  229. * * The regular expression "[1-9ab]" matches any number one through
  230. * nine, and the characters "a" and "b". It would match "hello 1"
  231. * or "begin", but would not match "no-match".
  232. *
  233. * * The regular expression "[^1-9ab]" matches any character that is
  234. * not a number one through nine, or an "a" or "b". It would NOT
  235. * match "hello 1" or "begin", but would match "no-match".
  236. *
  237. * * The regular expression "br* " matches something that begins with
  238. * a "b", is followed by zero or more "r"s, and ends in a space. It
  239. * would match "brrrrr ", and "b ", but would not match "brrh ".
  240. *
  241. * * The regular expression "br+ " matches something that begins with
  242. * a "b", is followed by one or more "r"s, and ends in a space. It
  243. * would match "brrrrr ", and "br ", but would not match "b " or
  244. * "brrh ".
  245. *
  246. * * The regular expression "br? " matches something that begins with
  247. * a "b", is followed by zero or one "r"s, and ends in a space. It
  248. * would match "br ", and "b ", but would not match "brrrr " or
  249. * "brrh ".
  250. *
  251. * * The regular expression "(..p)b" matches something ending with pb
  252. * and beginning with whatever the two characters before the first p
  253. * encountered in the line were. It would find "repb" in "rep drepa
  254. * qrepb". The regular expression "(..p)a" would find "repa qrepb"
  255. * in "rep drepa qrepb"
  256. *
  257. * * The regular expression "d(..p)" matches something ending with p,
  258. * beginning with d, and having two characters in between that are
  259. * the same as the two characters before the first p encountered in
  260. * the line. It would match "drepa qrepb" in "rep drepa qrepb".
  261. *
  262. * All methods of RegularExpression can be called simultaneously from
  263. * different threads, but only if each invocation uses its own instance of
  264. * RegularExpression.
  265. */
  266. class cmsys_EXPORT RegularExpression
  267. {
  268. public:
  269. /**
  270. * Instantiate RegularExpression with program=NULL.
  271. */
  272. inline RegularExpression();
  273. /**
  274. * Instantiate RegularExpression with compiled char*.
  275. */
  276. inline RegularExpression(char const*);
  277. /**
  278. * Instantiate RegularExpression as a copy of another regular expression.
  279. */
  280. RegularExpression(RegularExpression const&);
  281. /**
  282. * Instantiate RegularExpression with compiled string.
  283. */
  284. inline RegularExpression(std::string const&);
  285. /**
  286. * Destructor.
  287. */
  288. inline ~RegularExpression();
  289. /**
  290. * Compile a regular expression into internal code
  291. * for later pattern matching.
  292. */
  293. bool compile(char const*);
  294. /**
  295. * Compile a regular expression into internal code
  296. * for later pattern matching.
  297. */
  298. inline bool compile(std::string const&);
  299. /**
  300. * Matches the regular expression to the given string.
  301. * Returns true if found, and sets start and end indexes
  302. * in the RegularExpressionMatch instance accordingly.
  303. *
  304. * This method is thread safe when called with different
  305. * RegularExpressionMatch instances.
  306. */
  307. bool find(char const*, RegularExpressionMatch&) const;
  308. /**
  309. * Matches the regular expression to the given string.
  310. * Returns true if found, and sets start and end indexes accordingly.
  311. */
  312. inline bool find(char const*);
  313. /**
  314. * Matches the regular expression to the given std string.
  315. * Returns true if found, and sets start and end indexes accordingly.
  316. */
  317. inline bool find(std::string const&);
  318. /**
  319. * Match indices
  320. */
  321. inline RegularExpressionMatch const& regMatch() const;
  322. inline std::string::size_type start() const;
  323. inline std::string::size_type end() const;
  324. inline std::string::size_type start(int n) const;
  325. inline std::string::size_type end(int n) const;
  326. /**
  327. * Match strings
  328. */
  329. inline std::string match(int n) const;
  330. /**
  331. * Copy the given regular expression.
  332. */
  333. RegularExpression& operator=(const RegularExpression& rxp);
  334. /**
  335. * Returns true if two regular expressions have the same
  336. * compiled program for pattern matching.
  337. */
  338. bool operator==(RegularExpression const&) const;
  339. /**
  340. * Returns true if two regular expressions have different
  341. * compiled program for pattern matching.
  342. */
  343. inline bool operator!=(RegularExpression const&) const;
  344. /**
  345. * Returns true if have the same compiled regular expressions
  346. * and the same start and end pointers.
  347. */
  348. bool deep_equal(RegularExpression const&) const;
  349. /**
  350. * True if the compiled regexp is valid.
  351. */
  352. inline bool is_valid() const;
  353. /**
  354. * Marks the regular expression as invalid.
  355. */
  356. inline void set_invalid();
  357. private:
  358. RegularExpressionMatch regmatch;
  359. char regstart; // Internal use only
  360. char reganch; // Internal use only
  361. const char* regmust; // Internal use only
  362. std::string::size_type regmlen; // Internal use only
  363. char* program;
  364. int progsize;
  365. };
  366. /**
  367. * Create an empty regular expression.
  368. */
  369. inline RegularExpression::RegularExpression()
  370. {
  371. this->program = 0;
  372. }
  373. /**
  374. * Creates a regular expression from string s, and
  375. * compiles s.
  376. */
  377. inline RegularExpression::RegularExpression(const char* s)
  378. {
  379. this->program = 0;
  380. if (s) {
  381. this->compile(s);
  382. }
  383. }
  384. /**
  385. * Creates a regular expression from string s, and
  386. * compiles s.
  387. */
  388. inline RegularExpression::RegularExpression(const std::string& s)
  389. {
  390. this->program = 0;
  391. this->compile(s);
  392. }
  393. /**
  394. * Destroys and frees space allocated for the regular expression.
  395. */
  396. inline RegularExpression::~RegularExpression()
  397. {
  398. //#ifndef _WIN32
  399. delete[] this->program;
  400. //#endif
  401. }
  402. /**
  403. * Compile a regular expression into internal code
  404. * for later pattern matching.
  405. */
  406. inline bool RegularExpression::compile(std::string const& s)
  407. {
  408. return this->compile(s.c_str());
  409. }
  410. /**
  411. * Matches the regular expression to the given std string.
  412. * Returns true if found, and sets start and end indexes accordingly.
  413. */
  414. inline bool RegularExpression::find(const char* s)
  415. {
  416. return this->find(s, this->regmatch);
  417. }
  418. /**
  419. * Matches the regular expression to the given std string.
  420. * Returns true if found, and sets start and end indexes accordingly.
  421. */
  422. inline bool RegularExpression::find(std::string const& s)
  423. {
  424. return this->find(s.c_str());
  425. }
  426. /**
  427. * Returns the internal match object
  428. */
  429. inline RegularExpressionMatch const& RegularExpression::regMatch() const
  430. {
  431. return this->regmatch;
  432. }
  433. /**
  434. * Returns the start index of the full match.
  435. */
  436. inline std::string::size_type RegularExpression::start() const
  437. {
  438. return regmatch.start();
  439. }
  440. /**
  441. * Returns the end index of the full match.
  442. */
  443. inline std::string::size_type RegularExpression::end() const
  444. {
  445. return regmatch.end();
  446. }
  447. /**
  448. * Return start index of nth submatch. start(0) is the start of the full match.
  449. */
  450. inline std::string::size_type RegularExpression::start(int n) const
  451. {
  452. return regmatch.start(n);
  453. }
  454. /**
  455. * Return end index of nth submatch. end(0) is the end of the full match.
  456. */
  457. inline std::string::size_type RegularExpression::end(int n) const
  458. {
  459. return regmatch.end(n);
  460. }
  461. /**
  462. * Return nth submatch as a string.
  463. */
  464. inline std::string RegularExpression::match(int n) const
  465. {
  466. return regmatch.match(n);
  467. }
  468. /**
  469. * Returns true if two regular expressions have different
  470. * compiled program for pattern matching.
  471. */
  472. inline bool RegularExpression::operator!=(const RegularExpression& r) const
  473. {
  474. return (!(*this == r));
  475. }
  476. /**
  477. * Returns true if a valid regular expression is compiled
  478. * and ready for pattern matching.
  479. */
  480. inline bool RegularExpression::is_valid() const
  481. {
  482. return (this->program != 0);
  483. }
  484. inline void RegularExpression::set_invalid()
  485. {
  486. //#ifndef _WIN32
  487. delete[] this->program;
  488. //#endif
  489. this->program = 0;
  490. }
  491. } // namespace cmsys
  492. #endif