RegularExpression.hxx.in 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567
  1. /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
  2. file Copyright.txt or https://cmake.org/licensing#kwsys for details. */
  3. // Original Copyright notice:
  4. // Copyright (C) 1991 Texas Instruments Incorporated.
  5. //
  6. // Permission is granted to any individual or institution to use, copy, modify,
  7. // and distribute this software, provided that this complete copyright and
  8. // permission notice is maintained, intact, in all copies and supporting
  9. // documentation.
  10. //
  11. // Texas Instruments Incorporated provides this software "as is" without
  12. // express or implied warranty.
  13. //
  14. // Created: MNF 06/13/89 Initial Design and Implementation
  15. // Updated: LGO 08/09/89 Inherit from Generic
  16. // Updated: MBN 09/07/89 Added conditional exception handling
  17. // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
  18. // Updated: DLS 03/22/91 New lite version
  19. //
  20. #ifndef @KWSYS_NAMESPACE@_RegularExpression_hxx
  21. #define @KWSYS_NAMESPACE@_RegularExpression_hxx
  22. #include <@KWSYS_NAMESPACE@/Configure.h>
  23. #include <@KWSYS_NAMESPACE@/Configure.hxx>
  24. #include <string>
  25. namespace @KWSYS_NAMESPACE@ {
  26. // Forward declaration
  27. class RegularExpression;
  28. /** \class RegularExpressionMatch
  29. * \brief Stores the pattern matches of a RegularExpression
  30. */
  31. class @KWSYS_NAMESPACE@_EXPORT RegularExpressionMatch
  32. {
  33. public:
  34. RegularExpressionMatch();
  35. bool isValid() const;
  36. void clear();
  37. std::string::size_type start() const;
  38. std::string::size_type end() const;
  39. std::string::size_type start(int n) const;
  40. std::string::size_type end(int n) const;
  41. std::string match(int n) const;
  42. enum
  43. {
  44. NSUBEXP = 10
  45. };
  46. private:
  47. friend class RegularExpression;
  48. const char* startp[NSUBEXP];
  49. const char* endp[NSUBEXP];
  50. const char* searchstring;
  51. };
  52. #ifdef _MSC_VER
  53. # pragma warning(push)
  54. # if _MSC_VER < 1900
  55. # pragma warning(disable : 4351) /* new behavior */
  56. # endif
  57. #endif
  58. /**
  59. * \brief Creates an invalid match object
  60. */
  61. inline RegularExpressionMatch::RegularExpressionMatch()
  62. : startp{}
  63. , endp{}
  64. , searchstring{}
  65. {
  66. }
  67. #ifdef _MSC_VER
  68. # pragma warning(pop)
  69. #endif
  70. /**
  71. * \brief Returns true if the match pointers are valid
  72. */
  73. inline bool RegularExpressionMatch::isValid() const
  74. {
  75. return (this->startp[0] != nullptr);
  76. }
  77. /**
  78. * \brief Resets to the (invalid) construction state.
  79. */
  80. inline void RegularExpressionMatch::clear()
  81. {
  82. startp[0] = nullptr;
  83. endp[0] = nullptr;
  84. searchstring = nullptr;
  85. }
  86. /**
  87. * \brief Returns the start index of the full match.
  88. */
  89. inline std::string::size_type RegularExpressionMatch::start() const
  90. {
  91. return static_cast<std::string::size_type>(this->startp[0] - searchstring);
  92. }
  93. /**
  94. * \brief Returns the end index of the full match.
  95. */
  96. inline std::string::size_type RegularExpressionMatch::end() const
  97. {
  98. return static_cast<std::string::size_type>(this->endp[0] - searchstring);
  99. }
  100. /**
  101. * \brief Returns the start index of nth submatch.
  102. * start(0) is the start of the full match.
  103. */
  104. inline std::string::size_type RegularExpressionMatch::start(int n) const
  105. {
  106. return static_cast<std::string::size_type>(this->startp[n] -
  107. this->searchstring);
  108. }
  109. /**
  110. * \brief Returns the end index of nth submatch.
  111. * end(0) is the end of the full match.
  112. */
  113. inline std::string::size_type RegularExpressionMatch::end(int n) const
  114. {
  115. return static_cast<std::string::size_type>(this->endp[n] -
  116. this->searchstring);
  117. }
  118. /**
  119. * \brief Returns the nth submatch as a string.
  120. */
  121. inline std::string RegularExpressionMatch::match(int n) const
  122. {
  123. if (this->startp[n] == nullptr) {
  124. return std::string();
  125. } else {
  126. return std::string(
  127. this->startp[n],
  128. static_cast<std::string::size_type>(this->endp[n] - this->startp[n]));
  129. }
  130. }
  131. /** \class RegularExpression
  132. * \brief Implements pattern matching with regular expressions.
  133. *
  134. * This is the header file for the regular expression class. An object of
  135. * this class contains a regular expression, in a special "compiled" format.
  136. * This compiled format consists of several slots all kept as the objects
  137. * private data. The RegularExpression class provides a convenient way to
  138. * represent regular expressions. It makes it easy to search for the same
  139. * regular expression in many different strings without having to compile a
  140. * string to regular expression format more than necessary.
  141. *
  142. * This class implements pattern matching via regular expressions.
  143. * A regular expression allows a programmer to specify complex
  144. * patterns that can be searched for and matched against the
  145. * character string of a string object. In its simplest form, a
  146. * regular expression is a sequence of characters used to
  147. * search for exact character matches. However, many times the
  148. * exact sequence to be found is not known, or only a match at
 * the beginning or end of a string is desired. The RegularExpression
 * class implements regular expression pattern matching as is found
 * and implemented in many UNIX commands and utilities.
  153. *
  154. * Example: The perl code
  155. *
  156. * $filename =~ m"([a-z]+)\.cc";
  157. * print $1;
  158. *
  159. * Is written as follows in C++
  160. *
  161. * RegularExpression re("([a-z]+)\\.cc");
  162. * re.find(filename);
  163. * cerr << re.match(1);
  164. *
  165. *
 * The regular expression class provides a convenient mechanism
 * for specifying and manipulating regular expressions. The
 * regular expression object allows specification of such patterns
 * by using the following regular expression metacharacters:
  171. *
  172. * ^ Matches at beginning of a line
  173. *
  174. * $ Matches at end of a line
  175. *
  176. * . Matches any single character
  177. *
  178. * [ ] Matches any character(s) inside the brackets
  179. *
  180. * [^ ] Matches any character(s) not inside the brackets
  181. *
  182. * - Matches any character in range on either side of a dash
  183. *
  184. * * Matches preceding pattern zero or more times
  185. *
  186. * + Matches preceding pattern one or more times
  187. *
  188. * ? Matches preceding pattern zero or once only
  189. *
  190. * () Saves a matched expression and uses it in a later match
  191. *
  192. * Note that more than one of these metacharacters can be used
  193. * in a single regular expression in order to create complex
 * search patterns. For example, the pattern [^ab1-9] says to
 * match any single character that is not an "a", a "b", or a
 * number in the series one through nine.
  198. *
 * There are four constructors for RegularExpression. One just creates an
 * empty RegularExpression object. Two create a RegularExpression object
 * and initialize it with a regular expression that is given in the form
 * of a char* or a std::string. The last takes a reference to a
 * RegularExpression object as an argument and creates an object
 * initialized with the information from the given RegularExpression object.
  205. *
  206. * The find member function finds the first occurrence of the regular
  207. * expression of that object in the string given to find as an argument. Find
  208. * returns a boolean, and if true, mutates the private data appropriately.
  209. * Find sets pointers to the beginning and end of the thing last found, they
  210. * are pointers into the actual string that was searched. The start and end
  211. * member functions return indices into the searched string that correspond
  212. * to the beginning and end pointers respectively. The compile member
  213. * function takes a char* and puts the compiled version of the char* argument
 * into the object's private data fields. The == and != operators only check
 * to see if the compiled regular expression is the same, and the
 * deep_equal function also checks to see if the start and end pointers are
 * the same. The is_valid function returns false if program is set to
  218. * nullptr, (i.e. there is no valid compiled expression). The set_invalid
  219. * function sets the program to nullptr (Warning: this deletes the compiled
  220. * expression). The following examples may help clarify regular expression
  221. * usage:
  222. *
  223. * * The regular expression "^hello" matches a "hello" only at the
  224. * beginning of a line. It would match "hello there" but not "hi,
  225. * hello there".
  226. *
  227. * * The regular expression "long$" matches a "long" only at the end
  228. * of a line. It would match "so long\0", but not "long ago".
  229. *
  230. * * The regular expression "t..t..g" will match anything that has a
  231. * "t" then any two characters, another "t", any two characters and
  232. * then a "g". It will match "testing", or "test again" but would
  233. * not match "toasting"
  234. *
  235. * * The regular expression "[1-9ab]" matches any number one through
  236. * nine, and the characters "a" and "b". It would match "hello 1"
  237. * or "begin", but would not match "no-match".
  238. *
  239. * * The regular expression "[^1-9ab]" matches any character that is
  240. * not a number one through nine, or an "a" or "b". It would NOT
  241. * match "hello 1" or "begin", but would match "no-match".
  242. *
  243. * * The regular expression "br* " matches something that begins with
  244. * a "b", is followed by zero or more "r"s, and ends in a space. It
  245. * would match "brrrrr ", and "b ", but would not match "brrh ".
  246. *
  247. * * The regular expression "br+ " matches something that begins with
  248. * a "b", is followed by one or more "r"s, and ends in a space. It
  249. * would match "brrrrr ", and "br ", but would not match "b " or
  250. * "brrh ".
  251. *
  252. * * The regular expression "br? " matches something that begins with
  253. * a "b", is followed by zero or one "r"s, and ends in a space. It
  254. * would match "br ", and "b ", but would not match "brrrr " or
  255. * "brrh ".
  256. *
  257. * * The regular expression "(..p)b" matches something ending with pb
  258. * and beginning with whatever the two characters before the first p
  259. * encountered in the line were. It would find "repb" in "rep drepa
  260. * qrepb". The regular expression "(..p)a" would find "repa qrepb"
  261. * in "rep drepa qrepb"
  262. *
  263. * * The regular expression "d(..p)" matches something ending with p,
  264. * beginning with d, and having two characters in between that are
  265. * the same as the two characters before the first p encountered in
  266. * the line. It would match "drepa qrepb" in "rep drepa qrepb".
  267. *
 * All methods of RegularExpression can be called simultaneously from
 * different threads, but only if each invocation uses its own instance of
 * RegularExpression.
  271. */
  272. class @KWSYS_NAMESPACE@_EXPORT RegularExpression
  273. {
  274. public:
  275. /**
  276. * Instantiate RegularExpression with program=nullptr.
  277. */
  278. inline RegularExpression();
  279. /**
  280. * Instantiate RegularExpression with compiled char*.
  281. */
  282. inline RegularExpression(char const*);
  283. /**
  284. * Instantiate RegularExpression as a copy of another regular expression.
  285. */
  286. RegularExpression(RegularExpression const&);
  287. /**
  288. * Instantiate RegularExpression with compiled string.
  289. */
  290. inline RegularExpression(std::string const&);
  291. /**
  292. * Destructor.
  293. */
  294. inline ~RegularExpression();
  295. /**
  296. * Compile a regular expression into internal code
  297. * for later pattern matching.
  298. */
  299. bool compile(char const*);
  300. /**
  301. * Compile a regular expression into internal code
  302. * for later pattern matching.
  303. */
  304. inline bool compile(std::string const&);
  305. /**
  306. * Matches the regular expression to the given string.
  307. * Returns true if found, and sets start and end indexes
  308. * in the RegularExpressionMatch instance accordingly.
  309. *
  310. * This method is thread safe when called with different
  311. * RegularExpressionMatch instances.
  312. */
  313. bool find(char const*, RegularExpressionMatch&) const;
  314. /**
  315. * Matches the regular expression to the given string.
  316. * Returns true if found, and sets start and end indexes accordingly.
  317. */
  318. inline bool find(char const*);
  319. /**
  320. * Matches the regular expression to the given std string.
  321. * Returns true if found, and sets start and end indexes accordingly.
  322. */
  323. inline bool find(std::string const&);
  324. /**
  325. * Match indices
  326. */
  327. inline RegularExpressionMatch const& regMatch() const;
  328. inline std::string::size_type start() const;
  329. inline std::string::size_type end() const;
  330. inline std::string::size_type start(int n) const;
  331. inline std::string::size_type end(int n) const;
  332. /**
  333. * Match strings
  334. */
  335. inline std::string match(int n) const;
  336. /**
  337. * Copy the given regular expression.
  338. */
  339. RegularExpression& operator=(const RegularExpression& rxp);
  340. /**
  341. * Returns true if two regular expressions have the same
  342. * compiled program for pattern matching.
  343. */
  344. bool operator==(RegularExpression const&) const;
  345. /**
  346. * Returns true if two regular expressions have different
  347. * compiled program for pattern matching.
  348. */
  349. inline bool operator!=(RegularExpression const&) const;
  350. /**
  351. * Returns true if have the same compiled regular expressions
  352. * and the same start and end pointers.
  353. */
  354. bool deep_equal(RegularExpression const&) const;
  355. /**
  356. * True if the compiled regexp is valid.
  357. */
  358. inline bool is_valid() const;
  359. /**
  360. * Marks the regular expression as invalid.
  361. */
  362. inline void set_invalid();
  363. private:
  364. RegularExpressionMatch regmatch;
  365. char regstart; // Internal use only
  366. char reganch; // Internal use only
  367. const char* regmust; // Internal use only
  368. std::string::size_type regmlen; // Internal use only
  369. char* program;
  370. int progsize;
  371. };
  372. /**
  373. * Create an empty regular expression.
  374. */
  375. inline RegularExpression::RegularExpression()
  376. : regstart{}
  377. , reganch{}
  378. , regmust{}
  379. , program{ nullptr }
  380. , progsize{}
  381. {
  382. }
  383. /**
  384. * Creates a regular expression from string s, and
  385. * compiles s.
  386. */
  387. inline RegularExpression::RegularExpression(const char* s)
  388. : regstart{}
  389. , reganch{}
  390. , regmust{}
  391. , program{ nullptr }
  392. , progsize{}
  393. {
  394. if (s) {
  395. this->compile(s);
  396. }
  397. }
  398. /**
  399. * Creates a regular expression from string s, and
  400. * compiles s.
  401. */
  402. inline RegularExpression::RegularExpression(const std::string& s)
  403. : regstart{}
  404. , reganch{}
  405. , regmust{}
  406. , program{ nullptr }
  407. , progsize{}
  408. {
  409. this->compile(s);
  410. }
  411. /**
  412. * Destroys and frees space allocated for the regular expression.
  413. */
  414. inline RegularExpression::~RegularExpression()
  415. {
  416. //#ifndef _WIN32
  417. delete[] this->program;
  418. //#endif
  419. }
  420. /**
  421. * Compile a regular expression into internal code
  422. * for later pattern matching.
  423. */
  424. inline bool RegularExpression::compile(std::string const& s)
  425. {
  426. return this->compile(s.c_str());
  427. }
  428. /**
  429. * Matches the regular expression to the given std string.
  430. * Returns true if found, and sets start and end indexes accordingly.
  431. */
  432. inline bool RegularExpression::find(const char* s)
  433. {
  434. return this->find(s, this->regmatch);
  435. }
  436. /**
  437. * Matches the regular expression to the given std string.
  438. * Returns true if found, and sets start and end indexes accordingly.
  439. */
  440. inline bool RegularExpression::find(std::string const& s)
  441. {
  442. return this->find(s.c_str());
  443. }
  444. /**
  445. * Returns the internal match object
  446. */
  447. inline RegularExpressionMatch const& RegularExpression::regMatch() const
  448. {
  449. return this->regmatch;
  450. }
  451. /**
  452. * Returns the start index of the full match.
  453. */
  454. inline std::string::size_type RegularExpression::start() const
  455. {
  456. return regmatch.start();
  457. }
  458. /**
  459. * Returns the end index of the full match.
  460. */
  461. inline std::string::size_type RegularExpression::end() const
  462. {
  463. return regmatch.end();
  464. }
  465. /**
  466. * Return start index of nth submatch. start(0) is the start of the full match.
  467. */
  468. inline std::string::size_type RegularExpression::start(int n) const
  469. {
  470. return regmatch.start(n);
  471. }
  472. /**
  473. * Return end index of nth submatch. end(0) is the end of the full match.
  474. */
  475. inline std::string::size_type RegularExpression::end(int n) const
  476. {
  477. return regmatch.end(n);
  478. }
  479. /**
  480. * Return nth submatch as a string.
  481. */
  482. inline std::string RegularExpression::match(int n) const
  483. {
  484. return regmatch.match(n);
  485. }
  486. /**
  487. * Returns true if two regular expressions have different
  488. * compiled program for pattern matching.
  489. */
  490. inline bool RegularExpression::operator!=(const RegularExpression& r) const
  491. {
  492. return (!(*this == r));
  493. }
  494. /**
  495. * Returns true if a valid regular expression is compiled
  496. * and ready for pattern matching.
  497. */
  498. inline bool RegularExpression::is_valid() const
  499. {
  500. return (this->program != nullptr);
  501. }
  502. inline void RegularExpression::set_invalid()
  503. {
  504. //#ifndef _WIN32
  505. delete[] this->program;
  506. //#endif
  507. this->program = nullptr;
  508. }
  509. } // namespace @KWSYS_NAMESPACE@
  510. #endif