cmRegularExpression.cxx 37 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279
  1. //
  2. // Copyright (C) 1991 Texas Instruments Incorporated.
  3. //
  4. // Permission is granted to any individual or institution to use, copy, modify,
  5. // and distribute this software, provided that this complete copyright and
  6. // permission notice is maintained, intact, in all copies and supporting
  7. // documentation.
  8. //
  9. // Texas Instruments Incorporated provides this software "as is" without
  10. // express or implied warranty.
  11. //
  12. //
  13. // Created: MNF 06/13/89 Initial Design and Implementation
  14. // Updated: LGO 08/09/89 Inherit from Generic
  15. // Updated: MBN 09/07/89 Added conditional exception handling
  16. // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
  17. // Updated: DLS 03/22/91 New lite version
  18. //
  19. // This is the header file for the regular expression class. An object of
  20. // this class contains a regular expression, in a special "compiled" format.
  21. // This compiled format consists of several slots all kept as the objects
  22. // private data. The cmRegularExpression class provides a convenient way to represent
  23. // regular expressions. It makes it easy to search for the same regular
  24. // expression in many different strings without having to compile a string to
  25. // regular expression format more than necessary.
  26. //
  27. // A regular expression allows a programmer to specify complex patterns that
  28. // can be searched for and matched against the character string of a String
  29. // object. In its simplest case, a regular expression is a sequence of
  30. // characters with which you can search for exact character matches. However,
  31. // many times you may not know the exact sequence you want to find, or you may
  32. // only want to find a match at the beginning or end of a String. The cmRegularExpression
  33. // object allows specification of such patterns by utilizing the following
  34. // regular expression meta-characters (note that more one of these
  35. // meta-characters can be used in a single regular expression in order to
  36. // create complex search patterns):
  37. //
  38. // ^ Match at beginning of line
  39. // $ Match at end of line
  40. // . Match any single character
  41. // [ ] Match any one character inside the brackets
  42. // [^ ] Match any character NOT inside the brackets
  43. // - Match any character in range on either side of dash
  44. // * Match preceding pattern zero or more times
  45. // + Match preceding pattern one or more times
  46. // ? Match preceding pattern zero or once only
  47. // () Save a matched expression and use it in a further match.
  48. //
  49. // There are three constructors for cmRegularExpression. One just creates an empty cmRegularExpression
  50. // object. Another creates a cmRegularExpression object and initializes it with a regular
  51. // expression that is given in the form of a char*. The third takes a
  52. // reference to a cmRegularExpression object as an argument and creates an object
  53. // initialized with the information from the given cmRegularExpression object.
  54. //
  55. // The find member function finds the first occurence of the regualr
  56. // expression of that object in the string given to find as an argument. Find
  57. // returns a boolean, and if true, mutates the private data appropriately.
  58. // Find sets pointers to the beginning and end of the thing last found, they
  59. // are pointers into the actual string that was searched. The start and end
  60. // member functions return indicies into the searched string that correspond
  61. // to the beginning and end pointers respectively. The compile member
  62. // function takes a char* and puts the compiled version of the char* argument
  63. // into the object's private data fields. The == and != operators only check
  64. // the to see if the compiled regular expression is the same, and the
  65. // deep_equal functions also checks to see if the start and end pointers are
  66. // the same. The is_valid function returns false if program is set to NULL,
  67. // (i.e. there is no valid compiled exression). The set_invalid function sets
  68. // the program to NULL (Warning: this deletes the compiled expression). The
  69. // following examples may help clarify regular expression usage:
  70. //
  71. // * The regular expression "^hello" matches a "hello" only at the
  72. // beginning of a line. It would match "hello there" but not "hi,
  73. // hello there".
  74. //
  75. // * The regular expression "long$" matches a "long" only at the end
  76. // of a line. It would match "so long\0", but not "long ago".
  77. //
  78. // * The regular expression "t..t..g" will match anything that has a
  79. // "t" then any two characters, another "t", any two characters and
  80. // then a "g". It will match "testing", or "test again" but would
  81. // not match "toasting"
  82. //
  83. // * The regular expression "[1-9ab]" matches any number one through
  84. // nine, and the characters "a" and "b". It would match "hello 1"
  85. // or "begin", but would not match "no-match".
  86. //
  87. // * The regular expression "[^1-9ab]" matches any character that is
  88. // not a number one through nine, or an "a" or "b". It would NOT
  89. // match "hello 1" or "begin", but would match "no-match".
  90. //
  91. // * The regular expression "br* " matches something that begins with
  92. // a "b", is followed by zero or more "r"s, and ends in a space. It
  93. // would match "brrrrr ", and "b ", but would not match "brrh ".
  94. //
  95. // * The regular expression "br+ " matches something that begins with
  96. // a "b", is followed by one or more "r"s, and ends in a space. It
  97. // would match "brrrrr ", and "br ", but would not match "b " or
  98. // "brrh ".
  99. //
  100. // * The regular expression "br? " matches something that begins with
  101. // a "b", is followed by zero or one "r"s, and ends in a space. It
  102. // would match "br ", and "b ", but would not match "brrrr " or
  103. // "brrh ".
  104. //
  105. // * The regular expression "(..p)b" matches something ending with pb
  106. // and beginning with whatever the two characters before the first p
  107. // encounterd in the line were. It would find "repb" in "rep drepa
  108. // qrepb". The regular expression "(..p)a" would find "repa qrepb"
  109. // in "rep drepa qrepb"
  110. //
  111. // * The regular expression "d(..p)" matches something ending with p,
  112. // beginning with d, and having two characters in between that are
  113. // the same as the two characters before the first p encounterd in
  114. // the line. It would match "drepa qrepb" in "rep drepa qrepb".
  115. //
  116. #include "cmRegularExpression.h" // Include class specification
  117. #include <stdio.h>
  118. #include <string>
  119. // cmRegularExpression -- Copies the given regular expression.
  120. cmRegularExpression::cmRegularExpression (const cmRegularExpression& rxp) {
  121. int ind;
  122. this->progsize = rxp.progsize; // Copy regular expression size
  123. this->program = new char[this->progsize]; // Allocate storage
  124. for(ind=this->progsize; ind-- != 0;) // Copy regular expresion
  125. this->program[ind] = rxp.program[ind];
  126. this->startp[0] = rxp.startp[0]; // Copy pointers into last
  127. this->endp[0] = rxp.endp[0]; // Successful "find" operation
  128. this->regmust = rxp.regmust; // Copy field
  129. if (rxp.regmust != NULL) {
  130. char* dum = rxp.program;
  131. ind = 0;
  132. while (dum != rxp.regmust) {
  133. ++dum;
  134. ++ind;
  135. }
  136. this->regmust = this->program + ind;
  137. }
  138. this->regstart = rxp.regstart; // Copy starting index
  139. this->reganch = rxp.reganch; // Copy remaining private data
  140. this->regmlen = rxp.regmlen; // Copy remaining private data
  141. }
  142. // operator== -- Returns true if two regular expressions have the same
  143. // compiled program for pattern matching.
  144. bool cmRegularExpression::operator== (const cmRegularExpression& rxp) const {
  145. if (this != &rxp) { // Same address?
  146. int ind = this->progsize; // Get regular expression size
  147. if (ind != rxp.progsize) // If different size regexp
  148. return false; // Return failure
  149. while(ind-- != 0) // Else while still characters
  150. if(this->program[ind] != rxp.program[ind]) // If regexp are different
  151. return false; // Return failure
  152. }
  153. return true; // Else same, return success
  154. }
  155. // deep_equal -- Returns true if have the same compiled regular expressions
  156. // and the same start and end pointers.
  157. bool cmRegularExpression::deep_equal (const cmRegularExpression& rxp) const {
  158. int ind = this->progsize; // Get regular expression size
  159. if (ind != rxp.progsize) // If different size regexp
  160. return false; // Return failure
  161. while(ind-- != 0) // Else while still characters
  162. if(this->program[ind] != rxp.program[ind]) // If regexp are different
  163. return false; // Return failure
  164. return (this->startp[0] == rxp.startp[0] && // Else if same start/end ptrs,
  165. this->endp[0] == rxp.endp[0]); // Return true
  166. }
  167. // The remaining code in this file is derived from the regular expression code
  168. // whose copyright statement appears below. It has been changed to work
  169. // with the class concepts of C++ and COOL.
  170. /*
  171. * compile and find
  172. *
  173. * Copyright (c) 1986 by University of Toronto.
  174. * Written by Henry Spencer. Not derived from licensed software.
  175. *
  176. * Permission is granted to anyone to use this software for any
  177. * purpose on any computer system, and to redistribute it freely,
  178. * subject to the following restrictions:
  179. *
  180. * 1. The author is not responsible for the consequences of use of
  181. * this software, no matter how awful, even if they arise
  182. * from defects in it.
  183. *
  184. * 2. The origin of this software must not be misrepresented, either
  185. * by explicit claim or by omission.
  186. *
  187. * 3. Altered versions must be plainly marked as such, and must not
  188. * be misrepresented as being the original software.
  189. *
  190. * Beware that some of this code is subtly aware of the way operator
  191. * precedence is structured in regular expressions. Serious changes in
  192. * regular-expression syntax might require a total rethink.
  193. */
  194. /*
  195. * The "internal use only" fields in regexp.h are present to pass info from
  196. * compile to execute that permits the execute phase to run lots faster on
  197. * simple cases. They are:
  198. *
  199. * regstart char that must begin a match; '\0' if none obvious
  200. * reganch is the match anchored (at beginning-of-line only)?
  201. * regmust string (pointer into program) that match must include, or NULL
  202. * regmlen length of regmust string
  203. *
  204. * Regstart and reganch permit very fast decisions on suitable starting points
  205. * for a match, cutting down the work a lot. Regmust permits fast rejection
  206. * of lines that cannot possibly match. The regmust tests are costly enough
  207. * that compile() supplies a regmust only if the r.e. contains something
  208. * potentially expensive (at present, the only such thing detected is * or +
  209. * at the start of the r.e., which can involve a lot of backup). Regmlen is
  210. * supplied because the test in find() needs it and compile() is computing
  211. * it anyway.
  212. */
  213. /*
  214. * Structure for regexp "program". This is essentially a linear encoding
  215. * of a nondeterministic finite-state machine (aka syntax charts or
  216. * "railroad normal form" in parsing technology). Each node is an opcode
  217. * plus a "next" pointer, possibly plus an operand. "Next" pointers of
  218. * all nodes except BRANCH implement concatenation; a "next" pointer with
  219. * a BRANCH on both ends of it is connecting two alternatives. (Here we
  220. * have one of the subtle syntax dependencies: an individual BRANCH (as
  221. * opposed to a collection of them) is never concatenated with anything
  222. * because of operator precedence.) The operand of some types of node is
  223. * a literal string; for others, it is a node leading into a sub-FSM. In
  224. * particular, the operand of a BRANCH node is the first node of the branch.
  225. * (NB this is *not* a tree structure: the tail of the branch connects
  226. * to the thing following the set of BRANCHes.) The opcodes are:
  227. */
  228. // definition number opnd? meaning
  229. #define END 0 // no End of program.
  230. #define BOL 1 // no Match "" at beginning of line.
  231. #define EOL 2 // no Match "" at end of line.
  232. #define ANY 3 // no Match any one character.
  233. #define ANYOF 4 // str Match any character in this string.
  234. #define ANYBUT 5 // str Match any character not in this
  235. // string.
  236. #define BRANCH 6 // node Match this alternative, or the
  237. // next...
  238. #define BACK 7 // no Match "", "next" ptr points backward.
  239. #define EXACTLY 8 // str Match this string.
  240. #define NOTHING 9 // no Match empty string.
  241. #define STAR 10 // node Match this (simple) thing 0 or more
  242. // times.
  243. #define PLUS 11 // node Match this (simple) thing 1 or more
  244. // times.
  245. #define OPEN 20 // no Mark this point in input as start of
  246. // #n.
  247. // OPEN+1 is number 1, etc.
  248. #define CLOSE 30 // no Analogous to OPEN.
  249. /*
  250. * Opcode notes:
  251. *
  252. * BRANCH The set of branches constituting a single choice are hooked
  253. * together with their "next" pointers, since precedence prevents
  254. * anything being concatenated to any individual branch. The
  255. * "next" pointer of the last BRANCH in a choice points to the
  256. * thing following the whole choice. This is also where the
  257. * final "next" pointer of each individual branch points; each
  258. * branch starts with the operand node of a BRANCH node.
  259. *
  260. * BACK Normal "next" pointers all implicitly point forward; BACK
  261. * exists to make loop structures possible.
  262. *
  263. * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
  264. * BRANCH structures using BACK. Simple cases (one character
  265. * per match) are implemented with STAR and PLUS for speed
  266. * and to minimize recursive plunges.
  267. *
  268. * OPEN,CLOSE ...are numbered at compile time.
  269. */
  270. /*
  271. * A node is one char of opcode followed by two chars of "next" pointer.
  272. * "Next" pointers are stored as two 8-bit pieces, high order first. The
  273. * value is a positive offset from the opcode of the node containing it.
  274. * An operand, if any, simply follows the node. (Note that much of the
  275. * code generation knows about this implicit relationship.)
  276. *
  277. * Using two bytes for the "next" pointer is vast overkill for most things,
  278. * but allows patterns to get big without disasters.
  279. */
  280. #define OP(p) (*(p))
  281. #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
  282. #define OPERAND(p) ((p) + 3)
  283. const unsigned char MAGIC = 0234;
  284. /*
  285. * Utility definitions.
  286. */
  287. #define UCHARAT(p) ((const unsigned char*)(p))[0]
  288. #define FAIL(m) { regerror(m); return(NULL); }
  289. #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
  290. #define META "^$.[()|?+*\\"
  291. /*
  292. * Flags to be passed up and down.
  293. */
  294. #define HASWIDTH 01 // Known never to match null string.
  295. #define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
  296. #define SPSTART 04 // Starts with * or +.
  297. #define WORST 0 // Worst case.
  298. /////////////////////////////////////////////////////////////////////////
  299. //
  300. // COMPILE AND ASSOCIATED FUNCTIONS
  301. //
  302. /////////////////////////////////////////////////////////////////////////
  303. /*
  304. * Global work variables for compile().
  305. */
  306. static const char* regparse; // Input-scan pointer.
  307. static int regnpar; // () count.
  308. static char regdummy;
  309. static char* regcode; // Code-emit pointer; &regdummy = don't.
  310. static long regsize; // Code size.
  311. /*
  312. * Forward declarations for compile()'s friends.
  313. */
  314. // #ifndef static
  315. // #define static static
  316. // #endif
  317. static char* reg (int, int*);
  318. static char* regbranch (int*);
  319. static char* regpiece (int*);
  320. static char* regatom (int*);
  321. static char* regnode (char);
  322. static const char* regnext (register const char*);
  323. static char* regnext (register char*);
  324. static void regc (unsigned char);
  325. static void reginsert (char, char*);
  326. static void regtail (char*, const char*);
  327. static void regoptail (char*, const char*);
  328. #ifdef STRCSPN
  329. static int strcspn ();
  330. #endif
  331. /*
  332. * We can't allocate space until we know how big the compiled form will be,
  333. * but we can't compile it (and thus know how big it is) until we've got a
  334. * place to put the code. So we cheat: we compile it twice, once with code
  335. * generation turned off and size counting turned on, and once "for real".
  336. * This also means that we don't allocate space until we are sure that the
  337. * thing really will compile successfully, and we never have to move the
  338. * code and thus invalidate pointers into it. (Note that it has to be in
  339. * one piece because free() must be able to free it all.)
  340. *
  341. * Beware that the optimization-preparation code in here knows about some
  342. * of the structure of the compiled regexp.
  343. */
  344. // compile -- compile a regular expression into internal code
  345. // for later pattern matching.
  346. void cmRegularExpression::compile (const char* exp) {
  347. register const char* scan;
  348. register const char* longest;
  349. register unsigned long len;
  350. int flags;
  351. if (exp == NULL) {
  352. //RAISE Error, SYM(cmRegularExpression), SYM(No_Expr),
  353. printf ("cmRegularExpression::compile(): No expression supplied.\n");
  354. return;
  355. }
  356. // First pass: determine size, legality.
  357. regparse = exp;
  358. regnpar = 1;
  359. regsize = 0L;
  360. regcode = &regdummy;
  361. regc(MAGIC);
  362. if(!reg(0, &flags))
  363. {
  364. printf ("cmRegularExpression::compile(): Error in compile.\n");
  365. return;
  366. }
  367. this->startp[0] = this->endp[0] = this->searchstring = NULL;
  368. // Small enough for pointer-storage convention?
  369. if (regsize >= 32767L) { // Probably could be 65535L.
  370. //RAISE Error, SYM(cmRegularExpression), SYM(Expr_Too_Big),
  371. printf ("cmRegularExpression::compile(): Expression too big.\n");
  372. return;
  373. }
  374. // Allocate space.
  375. //#ifndef WIN32
  376. if (this->program != NULL) delete [] this->program;
  377. //#endif
  378. this->program = new char[regsize];
  379. this->progsize = (int) regsize;
  380. if (this->program == NULL) {
  381. //RAISE Error, SYM(cmRegularExpression), SYM(Out_Of_Memory),
  382. printf ("cmRegularExpression::compile(): Out of memory.\n");
  383. return;
  384. }
  385. // Second pass: emit code.
  386. regparse = exp;
  387. regnpar = 1;
  388. regcode = this->program;
  389. regc(MAGIC);
  390. reg(0, &flags);
  391. // Dig out information for optimizations.
  392. this->regstart = '\0'; // Worst-case defaults.
  393. this->reganch = 0;
  394. this->regmust = NULL;
  395. this->regmlen = 0;
  396. scan = this->program + 1; // First BRANCH.
  397. if (OP(regnext(scan)) == END) { // Only one top-level choice.
  398. scan = OPERAND(scan);
  399. // Starting-point info.
  400. if (OP(scan) == EXACTLY)
  401. this->regstart = *OPERAND(scan);
  402. else if (OP(scan) == BOL)
  403. this->reganch++;
  404. //
  405. // If there's something expensive in the r.e., find the longest
  406. // literal string that must appear and make it the regmust. Resolve
  407. // ties in favor of later strings, since the regstart check works
  408. // with the beginning of the r.e. and avoiding duplication
  409. // strengthens checking. Not a strong reason, but sufficient in the
  410. // absence of others.
  411. //
  412. if (flags & SPSTART) {
  413. longest = NULL;
  414. len = 0;
  415. for (; scan != NULL; scan = regnext(scan))
  416. if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
  417. longest = OPERAND(scan);
  418. len = strlen(OPERAND(scan));
  419. }
  420. this->regmust = longest;
  421. this->regmlen = len;
  422. }
  423. }
  424. }
  425. /*
  426. - reg - regular expression, i.e. main body or parenthesized thing
  427. *
  428. * Caller must absorb opening parenthesis.
  429. *
  430. * Combining parenthesis handling with the base level of regular expression
  431. * is a trifle forced, but the need to tie the tails of the branches to what
  432. * follows makes it hard to avoid.
  433. */
  434. static char* reg (int paren, int *flagp) {
  435. register char* ret;
  436. register char* br;
  437. register char* ender;
  438. register int parno =0;
  439. int flags;
  440. *flagp = HASWIDTH; // Tentatively.
  441. // Make an OPEN node, if parenthesized.
  442. if (paren) {
  443. if (regnpar >= NSUBEXP) {
  444. //RAISE Error, SYM(cmRegularExpression), SYM(Too_Many_Parens),
  445. printf ("cmRegularExpression::compile(): Too many parentheses.\n");
  446. return 0;
  447. }
  448. parno = regnpar;
  449. regnpar++;
  450. ret = regnode(OPEN + parno);
  451. }
  452. else
  453. ret = NULL;
  454. // Pick up the branches, linking them together.
  455. br = regbranch(&flags);
  456. if (br == NULL)
  457. return (NULL);
  458. if (ret != NULL)
  459. regtail(ret, br); // OPEN -> first.
  460. else
  461. ret = br;
  462. if (!(flags & HASWIDTH))
  463. *flagp &= ~HASWIDTH;
  464. *flagp |= flags & SPSTART;
  465. while (*regparse == '|') {
  466. regparse++;
  467. br = regbranch(&flags);
  468. if (br == NULL)
  469. return (NULL);
  470. regtail(ret, br); // BRANCH -> BRANCH.
  471. if (!(flags & HASWIDTH))
  472. *flagp &= ~HASWIDTH;
  473. *flagp |= flags & SPSTART;
  474. }
  475. // Make a closing node, and hook it on the end.
  476. ender = regnode((paren) ? CLOSE + parno : END);
  477. regtail(ret, ender);
  478. // Hook the tails of the branches to the closing node.
  479. for (br = ret; br != NULL; br = regnext(br))
  480. regoptail(br, ender);
  481. // Check for proper termination.
  482. if (paren && *regparse++ != ')') {
  483. //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
  484. printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
  485. return 0;
  486. }
  487. else if (!paren && *regparse != '\0') {
  488. if (*regparse == ')') {
  489. //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
  490. printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
  491. return 0;
  492. }
  493. else {
  494. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  495. printf ("cmRegularExpression::compile(): Internal error.\n");
  496. return 0;
  497. }
  498. // NOTREACHED
  499. }
  500. return (ret);
  501. }
  502. /*
  503. - regbranch - one alternative of an | operator
  504. *
  505. * Implements the concatenation operator.
  506. */
  507. static char* regbranch (int *flagp) {
  508. register char* ret;
  509. register char* chain;
  510. register char* latest;
  511. int flags;
  512. *flagp = WORST; // Tentatively.
  513. ret = regnode(BRANCH);
  514. chain = NULL;
  515. while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
  516. latest = regpiece(&flags);
  517. if (latest == NULL)
  518. return (NULL);
  519. *flagp |= flags & HASWIDTH;
  520. if (chain == NULL) // First piece.
  521. *flagp |= flags & SPSTART;
  522. else
  523. regtail(chain, latest);
  524. chain = latest;
  525. }
  526. if (chain == NULL) // Loop ran zero times.
  527. regnode(NOTHING);
  528. return (ret);
  529. }
  530. /*
  531. - regpiece - something followed by possible [*+?]
  532. *
  533. * Note that the branching code sequences used for ? and the general cases
  534. * of * and + are somewhat optimized: they use the same NOTHING node as
  535. * both the endmarker for their branch list and the body of the last branch.
  536. * It might seem that this node could be dispensed with entirely, but the
  537. * endmarker role is not redundant.
  538. */
  539. static char* regpiece (int *flagp) {
  540. register char* ret;
  541. register char op;
  542. register char* next;
  543. int flags;
  544. ret = regatom(&flags);
  545. if (ret == NULL)
  546. return (NULL);
  547. op = *regparse;
  548. if (!ISMULT(op)) {
  549. *flagp = flags;
  550. return (ret);
  551. }
  552. if (!(flags & HASWIDTH) && op != '?') {
  553. //RAISE Error, SYM(cmRegularExpression), SYM(Empty_Operand),
  554. printf ("cmRegularExpression::compile() : *+ operand could be empty.\n");
  555. return 0;
  556. }
  557. *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
  558. if (op == '*' && (flags & SIMPLE))
  559. reginsert(STAR, ret);
  560. else if (op == '*') {
  561. // Emit x* as (x&|), where & means "self".
  562. reginsert(BRANCH, ret); // Either x
  563. regoptail(ret, regnode(BACK)); // and loop
  564. regoptail(ret, ret); // back
  565. regtail(ret, regnode(BRANCH)); // or
  566. regtail(ret, regnode(NOTHING)); // null.
  567. }
  568. else if (op == '+' && (flags & SIMPLE))
  569. reginsert(PLUS, ret);
  570. else if (op == '+') {
  571. // Emit x+ as x(&|), where & means "self".
  572. next = regnode(BRANCH); // Either
  573. regtail(ret, next);
  574. regtail(regnode(BACK), ret); // loop back
  575. regtail(next, regnode(BRANCH)); // or
  576. regtail(ret, regnode(NOTHING)); // null.
  577. }
  578. else if (op == '?') {
  579. // Emit x? as (x|)
  580. reginsert(BRANCH, ret); // Either x
  581. regtail(ret, regnode(BRANCH)); // or
  582. next = regnode(NOTHING);// null.
  583. regtail(ret, next);
  584. regoptail(ret, next);
  585. }
  586. regparse++;
  587. if (ISMULT(*regparse)) {
  588. //RAISE Error, SYM(cmRegularExpression), SYM(Nested_Operand),
  589. printf ("cmRegularExpression::compile(): Nested *?+.\n");
  590. return 0;
  591. }
  592. return (ret);
  593. }
  594. /*
  595. - regatom - the lowest level
  596. *
  597. * Optimization: gobbles an entire sequence of ordinary characters so that
  598. * it can turn them into a single node, which is smaller to store and
  599. * faster to run. Backslashed characters are exceptions, each becoming a
  600. * separate node; the code is simpler that way and it's not worth fixing.
  601. */
  602. static char* regatom (int *flagp) {
  603. register char* ret;
  604. int flags;
  605. *flagp = WORST; // Tentatively.
  606. switch (*regparse++) {
  607. case '^':
  608. ret = regnode(BOL);
  609. break;
  610. case '$':
  611. ret = regnode(EOL);
  612. break;
  613. case '.':
  614. ret = regnode(ANY);
  615. *flagp |= HASWIDTH | SIMPLE;
  616. break;
  617. case '[':{
  618. register int rxpclass;
  619. register int rxpclassend;
  620. if (*regparse == '^') { // Complement of range.
  621. ret = regnode(ANYBUT);
  622. regparse++;
  623. }
  624. else
  625. ret = regnode(ANYOF);
  626. if (*regparse == ']' || *regparse == '-')
  627. regc(*regparse++);
  628. while (*regparse != '\0' && *regparse != ']') {
  629. if (*regparse == '-') {
  630. regparse++;
  631. if (*regparse == ']' || *regparse == '\0')
  632. regc('-');
  633. else {
  634. rxpclass = UCHARAT(regparse - 2) + 1;
  635. rxpclassend = UCHARAT(regparse);
  636. if (rxpclass > rxpclassend + 1) {
  637. //RAISE Error, SYM(cmRegularExpression), SYM(Invalid_Range),
  638. printf ("cmRegularExpression::compile(): Invalid range in [].\n");
  639. return 0;
  640. }
  641. for (; rxpclass <= rxpclassend; rxpclass++)
  642. regc(rxpclass);
  643. regparse++;
  644. }
  645. }
  646. else
  647. regc(*regparse++);
  648. }
  649. regc('\0');
  650. if (*regparse != ']') {
  651. //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Bracket),
  652. printf ("cmRegularExpression::compile(): Unmatched [].\n");
  653. return 0;
  654. }
  655. regparse++;
  656. *flagp |= HASWIDTH | SIMPLE;
  657. }
  658. break;
  659. case '(':
  660. ret = reg(1, &flags);
  661. if (ret == NULL)
  662. return (NULL);
  663. *flagp |= flags & (HASWIDTH | SPSTART);
  664. break;
  665. case '\0':
  666. case '|':
  667. case ')':
  668. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  669. printf ("cmRegularExpression::compile(): Internal error.\n"); // Never here
  670. return 0;
  671. case '?':
  672. case '+':
  673. case '*':
  674. //RAISE Error, SYM(cmRegularExpression), SYM(No_Operand),
  675. printf ("cmRegularExpression::compile(): ?+* follows nothing.\n");
  676. return 0;
  677. case '\\':
  678. if (*regparse == '\0') {
  679. //RAISE Error, SYM(cmRegularExpression), SYM(Trailing_Backslash),
  680. printf ("cmRegularExpression::compile(): Trailing backslash.\n");
  681. return 0;
  682. }
  683. ret = regnode(EXACTLY);
  684. regc(*regparse++);
  685. regc('\0');
  686. *flagp |= HASWIDTH | SIMPLE;
  687. break;
  688. default:{
  689. register int len;
  690. register char ender;
  691. regparse--;
  692. len = strcspn(regparse, META);
  693. if (len <= 0) {
  694. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  695. printf ("cmRegularExpression::compile(): Internal error.\n");
  696. return 0;
  697. }
  698. ender = *(regparse + len);
  699. if (len > 1 && ISMULT(ender))
  700. len--; // Back off clear of ?+* operand.
  701. *flagp |= HASWIDTH;
  702. if (len == 1)
  703. *flagp |= SIMPLE;
  704. ret = regnode(EXACTLY);
  705. while (len > 0) {
  706. regc(*regparse++);
  707. len--;
  708. }
  709. regc('\0');
  710. }
  711. break;
  712. }
  713. return (ret);
  714. }
  715. /*
  716. - regnode - emit a node
  717. Location.
  718. */
  719. static char* regnode (char op) {
  720. register char* ret;
  721. register char* ptr;
  722. ret = regcode;
  723. if (ret == &regdummy) {
  724. regsize += 3;
  725. return (ret);
  726. }
  727. ptr = ret;
  728. *ptr++ = op;
  729. *ptr++ = '\0'; // Null "next" pointer.
  730. *ptr++ = '\0';
  731. regcode = ptr;
  732. return (ret);
  733. }
  734. /*
  735. - regc - emit (if appropriate) a byte of code
  736. */
  737. static void regc (unsigned char b) {
  738. if (regcode != &regdummy)
  739. *regcode++ = b;
  740. else
  741. regsize++;
  742. }
  743. /*
  744. - reginsert - insert an operator in front of already-emitted operand
  745. *
  746. * Means relocating the operand.
  747. */
  748. static void reginsert (char op, char* opnd) {
  749. register char* src;
  750. register char* dst;
  751. register char* place;
  752. if (regcode == &regdummy) {
  753. regsize += 3;
  754. return;
  755. }
  756. src = regcode;
  757. regcode += 3;
  758. dst = regcode;
  759. while (src > opnd)
  760. *--dst = *--src;
  761. place = opnd; // Op node, where operand used to be.
  762. *place++ = op;
  763. *place++ = '\0';
  764. *place++ = '\0';
  765. }
  766. /*
  767. - regtail - set the next-pointer at the end of a node chain
  768. */
  769. static void regtail (char* p, const char* val) {
  770. register char* scan;
  771. register char* temp;
  772. register int offset;
  773. if (p == &regdummy)
  774. return;
  775. // Find last node.
  776. scan = p;
  777. for (;;) {
  778. temp = regnext(scan);
  779. if (temp == NULL)
  780. break;
  781. scan = temp;
  782. }
  783. if (OP(scan) == BACK)
  784. offset = (const char*)scan - val;
  785. else
  786. offset = val - scan;
  787. *(scan + 1) = (offset >> 8) & 0377;
  788. *(scan + 2) = offset & 0377;
  789. }
  790. /*
  791. - regoptail - regtail on operand of first argument; nop if operandless
  792. */
  793. static void regoptail (char* p, const char* val) {
  794. // "Operandless" and "op != BRANCH" are synonymous in practice.
  795. if (p == NULL || p == &regdummy || OP(p) != BRANCH)
  796. return;
  797. regtail(OPERAND(p), val);
  798. }
  799. ////////////////////////////////////////////////////////////////////////
  800. //
  801. // find and friends
  802. //
  803. ////////////////////////////////////////////////////////////////////////
  804. /*
  805. * Global work variables for find().
  806. */
  807. static const char* reginput; // String-input pointer.
  808. static const char* regbol; // Beginning of input, for ^ check.
  809. static const char* *regstartp; // Pointer to startp array.
  810. static const char* *regendp; // Ditto for endp.
  811. /*
  812. * Forwards.
  813. */
  814. static int regtry (const char*, const char* *,
  815. const char* *, const char*);
  816. static int regmatch (const char*);
  817. static int regrepeat (const char*);
  818. #ifdef DEBUG
  819. int regnarrate = 0;
  820. void regdump ();
  821. static char* regprop ();
  822. #endif
  823. bool cmRegularExpression::find (std::string const& s) {
  824. return find(s.c_str());
  825. }
  826. // find -- Matches the regular expression to the given string.
  827. // Returns true if found, and sets start and end indexes accordingly.
  828. bool cmRegularExpression::find (const char* string) {
  829. register const char* s;
  830. this->searchstring = string;
  831. // Check validity of program.
  832. if (!this->program || UCHARAT(this->program) != MAGIC) {
  833. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  834. printf ("cmRegularExpression::find(): Compiled regular expression corrupted.\n");
  835. return 0;
  836. }
  837. // If there is a "must appear" string, look for it.
  838. if (this->regmust != NULL) {
  839. s = string;
  840. while ((s = strchr(s, this->regmust[0])) != NULL) {
  841. if (strncmp(s, this->regmust, this->regmlen) == 0)
  842. break; // Found it.
  843. s++;
  844. }
  845. if (s == NULL) // Not present.
  846. return (0);
  847. }
  848. // Mark beginning of line for ^ .
  849. regbol = string;
  850. // Simplest case: anchored match need be tried only once.
  851. if (this->reganch)
  852. return (regtry(string, this->startp, this->endp, this->program) != 0);
  853. // Messy cases: unanchored match.
  854. s = string;
  855. if (this->regstart != '\0')
  856. // We know what char it must start with.
  857. while ((s = strchr(s, this->regstart)) != NULL) {
  858. if (regtry(s, this->startp, this->endp, this->program))
  859. return (1);
  860. s++;
  861. }
  862. else
  863. // We don't -- general case.
  864. do {
  865. if (regtry(s, this->startp, this->endp, this->program))
  866. return (1);
  867. } while (*s++ != '\0');
  868. // Failure.
  869. return (0);
  870. }
  871. /*
  872. - regtry - try match at specific point
  873. 0 failure, 1 success
  874. */
  875. static int regtry (const char* string, const char* *start,
  876. const char* *end, const char* prog) {
  877. register int i;
  878. register const char* *sp1;
  879. register const char* *ep;
  880. reginput = string;
  881. regstartp = start;
  882. regendp = end;
  883. sp1 = start;
  884. ep = end;
  885. for (i = NSUBEXP; i > 0; i--) {
  886. *sp1++ = NULL;
  887. *ep++ = NULL;
  888. }
  889. if (regmatch(prog + 1)) {
  890. start[0] = string;
  891. end[0] = reginput;
  892. return (1);
  893. }
  894. else
  895. return (0);
  896. }
  897. /*
  898. - regmatch - main matching routine
  899. *
  900. * Conceptually the strategy is simple: check to see whether the current
  901. * node matches, call self recursively to see whether the rest matches,
  902. * and then act accordingly. In practice we make some effort to avoid
  903. * recursion, in particular by going through "ordinary" nodes (that don't
  904. * need to know whether the rest of the match failed) by a loop instead of
  905. * by recursion.
  906. * 0 failure, 1 success
  907. */
  908. static int regmatch (const char* prog) {
  909. register const char* scan; // Current node.
  910. const char* next; // Next node.
  911. scan = prog;
  912. while (scan != NULL) {
  913. next = regnext(scan);
  914. switch (OP(scan)) {
  915. case BOL:
  916. if (reginput != regbol)
  917. return (0);
  918. break;
  919. case EOL:
  920. if (*reginput != '\0')
  921. return (0);
  922. break;
  923. case ANY:
  924. if (*reginput == '\0')
  925. return (0);
  926. reginput++;
  927. break;
  928. case EXACTLY:{
  929. register int len;
  930. register const char* opnd;
  931. opnd = OPERAND(scan);
  932. // Inline the first character, for speed.
  933. if (*opnd != *reginput)
  934. return (0);
  935. len = strlen(opnd);
  936. if (len > 1 && strncmp(opnd, reginput, len) != 0)
  937. return (0);
  938. reginput += len;
  939. }
  940. break;
  941. case ANYOF:
  942. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
  943. return (0);
  944. reginput++;
  945. break;
  946. case ANYBUT:
  947. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
  948. return (0);
  949. reginput++;
  950. break;
  951. case NOTHING:
  952. break;
  953. case BACK:
  954. break;
  955. case OPEN + 1:
  956. case OPEN + 2:
  957. case OPEN + 3:
  958. case OPEN + 4:
  959. case OPEN + 5:
  960. case OPEN + 6:
  961. case OPEN + 7:
  962. case OPEN + 8:
  963. case OPEN + 9:{
  964. register int no;
  965. register const char* save;
  966. no = OP(scan) - OPEN;
  967. save = reginput;
  968. if (regmatch(next)) {
  969. //
  970. // Don't set startp if some later invocation of the
  971. // same parentheses already has.
  972. //
  973. if (regstartp[no] == NULL)
  974. regstartp[no] = save;
  975. return (1);
  976. }
  977. else
  978. return (0);
  979. }
  980. // break;
  981. case CLOSE + 1:
  982. case CLOSE + 2:
  983. case CLOSE + 3:
  984. case CLOSE + 4:
  985. case CLOSE + 5:
  986. case CLOSE + 6:
  987. case CLOSE + 7:
  988. case CLOSE + 8:
  989. case CLOSE + 9:{
  990. register int no;
  991. register const char* save;
  992. no = OP(scan) - CLOSE;
  993. save = reginput;
  994. if (regmatch(next)) {
  995. //
  996. // Don't set endp if some later invocation of the
  997. // same parentheses already has.
  998. //
  999. if (regendp[no] == NULL)
  1000. regendp[no] = save;
  1001. return (1);
  1002. }
  1003. else
  1004. return (0);
  1005. }
  1006. // break;
  1007. case BRANCH:{
  1008. register const char* save;
  1009. if (OP(next) != BRANCH) // No choice.
  1010. next = OPERAND(scan); // Avoid recursion.
  1011. else {
  1012. do {
  1013. save = reginput;
  1014. if (regmatch(OPERAND(scan)))
  1015. return (1);
  1016. reginput = save;
  1017. scan = regnext(scan);
  1018. } while (scan != NULL && OP(scan) == BRANCH);
  1019. return (0);
  1020. // NOTREACHED
  1021. }
  1022. }
  1023. break;
  1024. case STAR:
  1025. case PLUS:{
  1026. register char nextch;
  1027. register int no;
  1028. register const char* save;
  1029. register int min_no;
  1030. //
  1031. // Lookahead to avoid useless match attempts when we know
  1032. // what character comes next.
  1033. //
  1034. nextch = '\0';
  1035. if (OP(next) == EXACTLY)
  1036. nextch = *OPERAND(next);
  1037. min_no = (OP(scan) == STAR) ? 0 : 1;
  1038. save = reginput;
  1039. no = regrepeat(OPERAND(scan));
  1040. while (no >= min_no) {
  1041. // If it could work, try it.
  1042. if (nextch == '\0' || *reginput == nextch)
  1043. if (regmatch(next))
  1044. return (1);
  1045. // Couldn't or didn't -- back up.
  1046. no--;
  1047. reginput = save + no;
  1048. }
  1049. return (0);
  1050. }
  1051. // break;
  1052. case END:
  1053. return (1); // Success!
  1054. default:
  1055. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  1056. printf ("cmRegularExpression::find(): Internal error -- memory corrupted.\n");
  1057. return 0;
  1058. }
  1059. scan = next;
  1060. }
  1061. //
  1062. // We get here only if there's trouble -- normally "case END" is the
  1063. // terminating point.
  1064. //
  1065. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  1066. printf ("cmRegularExpression::find(): Internal error -- corrupted pointers.\n");
  1067. return (0);
  1068. }
  1069. /*
  1070. - regrepeat - repeatedly match something simple, report how many
  1071. */
  1072. static int regrepeat (const char* p) {
  1073. register int count = 0;
  1074. register const char* scan;
  1075. register const char* opnd;
  1076. scan = reginput;
  1077. opnd = OPERAND(p);
  1078. switch (OP(p)) {
  1079. case ANY:
  1080. count = strlen(scan);
  1081. scan += count;
  1082. break;
  1083. case EXACTLY:
  1084. while (*opnd == *scan) {
  1085. count++;
  1086. scan++;
  1087. }
  1088. break;
  1089. case ANYOF:
  1090. while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
  1091. count++;
  1092. scan++;
  1093. }
  1094. break;
  1095. case ANYBUT:
  1096. while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
  1097. count++;
  1098. scan++;
  1099. }
  1100. break;
  1101. default: // Oh dear. Called inappropriately.
  1102. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  1103. printf ("cm RegularExpression::find(): Internal error.\n");
  1104. return 0;
  1105. }
  1106. reginput = scan;
  1107. return (count);
  1108. }
  1109. /*
  1110. - regnext - dig the "next" pointer out of a node
  1111. */
  1112. static const char* regnext (register const char* p) {
  1113. register int offset;
  1114. if (p == &regdummy)
  1115. return (NULL);
  1116. offset = NEXT(p);
  1117. if (offset == 0)
  1118. return (NULL);
  1119. if (OP(p) == BACK)
  1120. return (p - offset);
  1121. else
  1122. return (p + offset);
  1123. }
  1124. static char* regnext (register char* p) {
  1125. register int offset;
  1126. if (p == &regdummy)
  1127. return (NULL);
  1128. offset = NEXT(p);
  1129. if (offset == 0)
  1130. return (NULL);
  1131. if (OP(p) == BACK)
  1132. return (p - offset);
  1133. else
  1134. return (p + offset);
  1135. }