cmRegularExpression.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. /*=========================================================================
  2. Program: Insight Segmentation & Registration Toolkit
  3. Module: $RCSfile$
  4. Language: C++
  5. Date: $Date$
  6. Version: $Revision$
  7. Copyright (c) 2000 National Library of Medicine
  8. All rights reserved.
  9. See COPYRIGHT.txt for copyright details.
  10. =========================================================================*/
  11. // Original Copyright notice:
  12. // Copyright (C) 1991 Texas Instruments Incorporated.
  13. //
  14. // Permission is granted to any individual or institution to use, copy, modify,
  15. // and distribute this software, provided that this complete copyright and
  16. // permission notice is maintained, intact, in all copies and supporting
  17. // documentation.
  18. //
  19. // Texas Instruments Incorporated provides this software "as is" without
  20. // express or implied warranty.
  21. //
  22. // Created: MNF 06/13/89 Initial Design and Implementation
  23. // Updated: LGO 08/09/89 Inherit from Generic
  24. // Updated: MBN 09/07/89 Added conditional exception handling
  25. // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
  26. // Updated: DLS 03/22/91 New lite version
  27. //
  28. #ifndef cmRegularExpression_h
  29. #define cmRegularExpression_h
  30. #include "cmStandardIncludes.h"
  /**
   * Number of match slots kept by cmRegularExpression: it sizes the
   * startp/endp pointer arrays.  Slot 0 describes the full match and the
   * remaining slots describe the sub-matches.
   */
  int const NSUBEXP = 10;
  32. /** \class cmRegularExpression
  33. * \brief Implements pattern matching with regular expressions.
  34. *
  35. * This is the header file for the regular expression class. An object of
  36. * this class contains a regular expression, in a special "compiled" format.
  37. * This compiled format consists of several slots all kept as the objects
  38. * private data. The cmRegularExpression class provides a convenient way to
  39. * represent regular expressions. It makes it easy to search for the same
  40. * regular expression in many different strings without having to compile a
  41. * string to regular expression format more than necessary.
  42. *
  43. * This class implements pattern matching via regular expressions.
  44. * A regular expression allows a programmer to specify complex
  45. * patterns that can be searched for and matched against the
  46. * character string of a string object. In its simplest form, a
  47. * regular expression is a sequence of characters used to
  48. * search for exact character matches. However, many times the
  49. * exact sequence to be found is not known, or only a match at
  50. * the beginning or end of a string is desired. The cmRegularExpression
  51. * class implements regular expression pattern
  52. * matching as is found and implemented in many UNIX commands
  53. * and utilities.
  54. *
  55. * Example: The perl code
  56. *
  57. * $filename =~ m"([a-z]+)\.cc";
  58. * print $1;
  59. *
  60. * Is written as follows in C++
  61. *
  62. * cmRegularExpression re("([a-z]+)\\.cc");
  63. * re.find(filename);
  64. * cerr << re.match(1);
  65. *
  66. *
  67. * The regular expression class provides a convenient mechanism
  68. * for specifying and manipulating regular expressions. The
  69. * regular expression object allows specification of such
  70. * patterns by using the following regular expression
  71. * metacharacters:
  72. *
  73. * ^ Matches at beginning of a line
  74. *
  75. * $ Matches at end of a line
  76. *
  77. * . Matches any single character
  78. *
  79. * [ ] Matches any character(s) inside the brackets
  80. *
  81. * [^ ] Matches any character(s) not inside the brackets
  82. *
  83. * - Matches any character in range on either side of a dash
  84. *
  85. * * Matches preceding pattern zero or more times
  86. *
  87. * + Matches preceding pattern one or more times
  88. *
  89. * ? Matches preceding pattern zero or once only
  90. *
  91. * () Saves a matched expression and uses it in a later match
  92. *
  93. * Note that more than one of these metacharacters can be used
  94. * in a single regular expression in order to create complex
  95. * search patterns. For example, the pattern [^ab1-9] says to
  96. * match any single character that is not the character "a",
  97. * the character "b", or a digit in the range one through
  98. * nine.
  99. *
  100. * There are three constructors for cmRegularExpression. One just creates an
  101. * empty cmRegularExpression object. Another creates a cmRegularExpression
  102. * object and initializes it with a regular expression that is given in the
  103. * form of a char*. The third takes a reference to a cmRegularExpression
  104. * object as an argument and creates an object initialized with the
  105. * information from the given cmRegularExpression object.
  106. *
  107. * The find member function finds the first occurrence of the regular
  108. * expression of that object in the string given to find as an argument. Find
  109. * returns a boolean, and if true, mutates the private data appropriately.
  110. * Find sets pointers to the beginning and end of the thing last found, they
  111. * are pointers into the actual string that was searched. The start and end
  112. * member functions return indices into the searched string that correspond
  113. * to the beginning and end pointers respectively. The compile member
  114. * function takes a char* and puts the compiled version of the char* argument
  115. * into the object's private data fields. The == and != operators only check
  116. * to see if the compiled regular expression is the same, and the
  117. * deep_equal function also checks to see if the start and end pointers are
  118. * the same. The is_valid function returns false if program is set to NULL,
  119. * (i.e. there is no valid compiled expression). The set_invalid function sets
  120. * the program to NULL (Warning: this deletes the compiled expression). The
  121. * following examples may help clarify regular expression usage:
  122. *
  123. * * The regular expression "^hello" matches a "hello" only at the
  124. * beginning of a line. It would match "hello there" but not "hi,
  125. * hello there".
  126. *
  127. * * The regular expression "long$" matches a "long" only at the end
  128. * of a line. It would match "so long\0", but not "long ago".
  129. *
  130. * * The regular expression "t..t..g" will match anything that has a
  131. * "t" then any two characters, another "t", any two characters and
  132. * then a "g". It will match "testing", or "test again" but would
  133. * not match "toasting"
  134. *
  135. * * The regular expression "[1-9ab]" matches any number one through
  136. * nine, and the characters "a" and "b". It would match "hello 1"
  137. * or "begin", but would not match "no-match".
  138. *
  139. * * The regular expression "[^1-9ab]" matches any character that is
  140. * not a number one through nine, or an "a" or "b". It would NOT
  141. * match "hello 1" or "begin", but would match "no-match".
  142. *
  143. * * The regular expression "br* " matches something that begins with
  144. * a "b", is followed by zero or more "r"s, and ends in a space. It
  145. * would match "brrrrr ", and "b ", but would not match "brrh ".
  146. *
  147. * * The regular expression "br+ " matches something that begins with
  148. * a "b", is followed by one or more "r"s, and ends in a space. It
  149. * would match "brrrrr ", and "br ", but would not match "b " or
  150. * "brrh ".
  151. *
  152. * * The regular expression "br? " matches something that begins with
  153. * a "b", is followed by zero or one "r"s, and ends in a space. It
  154. * would match "br ", and "b ", but would not match "brrrr " or
  155. * "brrh ".
  156. *
  157. * * The regular expression "(..p)b" matches something ending with pb
  158. * and beginning with whatever the two characters before the first p
  159. * encountered in the line were. It would find "repb" in "rep drepa
  160. * qrepb". The regular expression "(..p)a" would find "repa qrepb"
  161. * in "rep drepa qrepb"
  162. *
  163. * * The regular expression "d(..p)" matches something ending with p,
  164. * beginning with d, and having two characters in between that are
  165. * the same as the two characters before the first p encountered in
  166. * the line. It would match "drepa qrepb" in "rep drepa qrepb".
  167. *
  168. */
  169. class cmRegularExpression
  170. {
  171. public:
  172. /**
  173. * Instantiate cmRegularExpression with program=NULL.
  174. */
  175. inline cmRegularExpression ();
  176. /**
  177. * Instantiate cmRegularExpression with compiled char*.
  178. */
  179. inline cmRegularExpression (char const*);
  180. /**
  181. * Instantiate cmRegularExpression as a copy of another regular expression.
  182. */
  183. cmRegularExpression (cmRegularExpression const&);
  184. /**
  185. * Destructor.
  186. */
  187. inline ~cmRegularExpression();
  188. /**
  189. * Compile a regular expression into internal code
  190. * for later pattern matching.
  191. */
  192. void compile (char const*);
  193. /**
  194. * Matches the regular expression to the given string.
  195. * Returns true if found, and sets start and end indexes accordingly.
  196. */
  197. bool find (char const*);
  198. /**
  199. * Matches the regular expression to the given std string.
  200. * Returns true if found, and sets start and end indexes accordingly.
  201. */
  202. bool find (std::string const&);
  203. /**
  204. * Index to start of first find.
  205. */
  206. inline long start() const;
  207. /**
  208. * Index to end of first find.
  209. */
  210. inline long end() const;
  211. /**
  212. * Returns true if two regular expressions have the same
  213. * compiled program for pattern matching.
  214. */
  215. bool operator== (cmRegularExpression const&) const;
  216. /**
  217. * Returns true if two regular expressions have different
  218. * compiled program for pattern matching.
  219. */
  220. inline bool operator!= (cmRegularExpression const&) const;
  221. /**
  222. * Returns true if have the same compiled regular expressions
  223. * and the same start and end pointers.
  224. */
  225. bool deep_equal (cmRegularExpression const&) const;
  226. /**
  227. * True if the compiled regexp is valid.
  228. */
  229. inline bool is_valid() const;
  230. /**
  231. * Marks the regular expression as invalid.
  232. */
  233. inline void set_invalid();
  234. /**
  235. * Destructor.
  236. */
  237. // awf added
  238. int start(int n) const;
  239. int end(int n) const;
  240. std::string match(int n) const;
  241. private:
  242. const char* startp[NSUBEXP];
  243. const char* endp[NSUBEXP];
  244. char regstart; // Internal use only
  245. char reganch; // Internal use only
  246. const char* regmust; // Internal use only
  247. int regmlen; // Internal use only
  248. char* program;
  249. int progsize;
  250. const char* searchstring;
  251. };
  252. /**
  253. * Create an empty regular expression.
  254. */
  255. inline cmRegularExpression::cmRegularExpression ()
  256. {
  257. this->program = NULL;
  258. }
  259. /**
  260. * Creates a regular expression from string s, and
  261. * compiles s.
  262. */
  263. inline cmRegularExpression::cmRegularExpression (const char* s)
  264. {
  265. this->program = NULL;
  266. compile(s);
  267. }
  268. /**
  269. * Destroys and frees space allocated for the regular expression.
  270. */
  271. inline cmRegularExpression::~cmRegularExpression ()
  272. {
  273. //#ifndef WIN32
  274. delete [] this->program;
  275. //#endif
  276. }
  277. /**
  278. * Set the start position for the regular expression.
  279. */
  280. inline long cmRegularExpression::start () const
  281. {
  282. return(this->startp[0] - searchstring);
  283. }
  284. /**
  285. * Returns the start/end index of the last item found.
  286. */
  287. inline long cmRegularExpression::end () const
  288. {
  289. return(this->endp[0] - searchstring);
  290. }
  291. /**
  292. * Returns true if two regular expressions have different
  293. * compiled program for pattern matching.
  294. */
  295. inline bool cmRegularExpression::operator!= (const cmRegularExpression& r) const
  296. {
  297. return(!(*this == r));
  298. }
  299. /**
  300. * Returns true if a valid regular expression is compiled
  301. * and ready for pattern matching.
  302. */
  303. inline bool cmRegularExpression::is_valid () const
  304. {
  305. return (this->program != NULL);
  306. }
  307. inline void cmRegularExpression::set_invalid ()
  308. {
  309. //#ifndef WIN32
  310. delete [] this->program;
  311. //#endif
  312. this->program = NULL;
  313. }
  314. /**
  315. * Return start index of nth submatch. start(0) is the start of the full match.
  316. */
  317. inline int cmRegularExpression::start(int n) const
  318. {
  319. return this->startp[n] - searchstring;
  320. }
  321. /**
  322. * Return end index of nth submatch. end(0) is the end of the full match.
  323. */
  324. inline int cmRegularExpression::end(int n) const
  325. {
  326. return this->endp[n] - searchstring;
  327. }
  328. /**
  329. * Return nth submatch as a string.
  330. */
  331. inline std::string cmRegularExpression::match(int n) const
  332. {
  333. return std::string(this->startp[n], this->endp[n] - this->startp[n]);
  334. }
  335. #endif // cmRegularExpression_h