cmRegularExpression.cxx 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220
  1. /*=========================================================================
  2. Program: Insight Segmentation & Registration Toolkit
  3. Module: $RCSfile$
  4. Language: C++
  5. Date: $Date$
  6. Version: $Revision$
  7. Copyright (c) 2001 Insight Consortium
  8. All rights reserved.
  9. Redistribution and use in source and binary forms, with or without
  10. modification, are permitted provided that the following conditions are met:
  11. * Redistributions of source code must retain the above copyright notice,
  12. this list of conditions and the following disclaimer.
  13. * Redistributions in binary form must reproduce the above copyright notice,
  14. this list of conditions and the following disclaimer in the documentation
  15. and/or other materials provided with the distribution.
  16. * The name of the Insight Consortium, nor the names of any consortium members,
  17. nor of any contributors, may be used to endorse or promote products derived
  18. from this software without specific prior written permission.
  19. * Modified source versions must be plainly marked as such, and must not be
  20. misrepresented as being the original software.
  21. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
  22. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24. ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
  25. ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  27. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  28. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  29. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. =========================================================================*/
  32. //
  33. // Copyright (C) 1991 Texas Instruments Incorporated.
  34. //
  35. // Permission is granted to any individual or institution to use, copy, modify,
  36. // and distribute this software, provided that this complete copyright and
  37. // permission notice is maintained, intact, in all copies and supporting
  38. // documentation.
  39. //
  40. // Texas Instruments Incorporated provides this software "as is" without
  41. // express or implied warranty.
  42. //
  43. //
  44. // Created: MNF 06/13/89 Initial Design and Implementation
  45. // Updated: LGO 08/09/89 Inherit from Generic
  46. // Updated: MBN 09/07/89 Added conditional exception handling
  47. // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
  48. // Updated: DLS 03/22/91 New lite version
  49. //
  50. #include "cmRegularExpression.h" // Include class specification
  51. #include "cmStandardIncludes.h"
  52. #include <stdio.h>
  53. // cmRegularExpression -- Copies the given regular expression.
  54. cmRegularExpression::cmRegularExpression (const cmRegularExpression& rxp) {
  55. int ind;
  56. this->progsize = rxp.progsize; // Copy regular expression size
  57. this->program = new char[this->progsize]; // Allocate storage
  58. for(ind=this->progsize; ind-- != 0;) // Copy regular expresion
  59. this->program[ind] = rxp.program[ind];
  60. this->startp[0] = rxp.startp[0]; // Copy pointers into last
  61. this->endp[0] = rxp.endp[0]; // Successful "find" operation
  62. this->regmust = rxp.regmust; // Copy field
  63. if (rxp.regmust != NULL) {
  64. char* dum = rxp.program;
  65. ind = 0;
  66. while (dum != rxp.regmust) {
  67. ++dum;
  68. ++ind;
  69. }
  70. this->regmust = this->program + ind;
  71. }
  72. this->regstart = rxp.regstart; // Copy starting index
  73. this->reganch = rxp.reganch; // Copy remaining private data
  74. this->regmlen = rxp.regmlen; // Copy remaining private data
  75. }
  76. // operator== -- Returns true if two regular expressions have the same
  77. // compiled program for pattern matching.
  78. bool cmRegularExpression::operator== (const cmRegularExpression& rxp) const {
  79. if (this != &rxp) { // Same address?
  80. int ind = this->progsize; // Get regular expression size
  81. if (ind != rxp.progsize) // If different size regexp
  82. return false; // Return failure
  83. while(ind-- != 0) // Else while still characters
  84. if(this->program[ind] != rxp.program[ind]) // If regexp are different
  85. return false; // Return failure
  86. }
  87. return true; // Else same, return success
  88. }
  89. // deep_equal -- Returns true if have the same compiled regular expressions
  90. // and the same start and end pointers.
  91. bool cmRegularExpression::deep_equal (const cmRegularExpression& rxp) const {
  92. int ind = this->progsize; // Get regular expression size
  93. if (ind != rxp.progsize) // If different size regexp
  94. return false; // Return failure
  95. while(ind-- != 0) // Else while still characters
  96. if(this->program[ind] != rxp.program[ind]) // If regexp are different
  97. return false; // Return failure
  98. return (this->startp[0] == rxp.startp[0] && // Else if same start/end ptrs,
  99. this->endp[0] == rxp.endp[0]); // Return true
  100. }
  101. // The remaining code in this file is derived from the regular expression code
  102. // whose copyright statement appears below. It has been changed to work
  103. // with the class concepts of C++ and COOL.
  104. /*
  105. * compile and find
  106. *
  107. * Copyright (c) 1986 by University of Toronto.
  108. * Written by Henry Spencer. Not derived from licensed software.
  109. *
  110. * Permission is granted to anyone to use this software for any
  111. * purpose on any computer system, and to redistribute it freely,
  112. * subject to the following restrictions:
  113. *
  114. * 1. The author is not responsible for the consequences of use of
  115. * this software, no matter how awful, even if they arise
  116. * from defects in it.
  117. *
  118. * 2. The origin of this software must not be misrepresented, either
  119. * by explicit claim or by omission.
  120. *
  121. * 3. Altered versions must be plainly marked as such, and must not
  122. * be misrepresented as being the original software.
  123. *
  124. * Beware that some of this code is subtly aware of the way operator
  125. * precedence is structured in regular expressions. Serious changes in
  126. * regular-expression syntax might require a total rethink.
  127. */
  128. /*
  129. * The "internal use only" fields in regexp.h are present to pass info from
  130. * compile to execute that permits the execute phase to run lots faster on
  131. * simple cases. They are:
  132. *
  133. * regstart char that must begin a match; '\0' if none obvious
  134. * reganch is the match anchored (at beginning-of-line only)?
  135. * regmust string (pointer into program) that match must include, or NULL
  136. * regmlen length of regmust string
  137. *
  138. * Regstart and reganch permit very fast decisions on suitable starting points
  139. * for a match, cutting down the work a lot. Regmust permits fast rejection
  140. * of lines that cannot possibly match. The regmust tests are costly enough
  141. * that compile() supplies a regmust only if the r.e. contains something
  142. * potentially expensive (at present, the only such thing detected is * or +
  143. * at the start of the r.e., which can involve a lot of backup). Regmlen is
  144. * supplied because the test in find() needs it and compile() is computing
  145. * it anyway.
  146. */
  147. /*
  148. * Structure for regexp "program". This is essentially a linear encoding
  149. * of a nondeterministic finite-state machine (aka syntax charts or
  150. * "railroad normal form" in parsing technology). Each node is an opcode
  151. * plus a "next" pointer, possibly plus an operand. "Next" pointers of
  152. * all nodes except BRANCH implement concatenation; a "next" pointer with
  153. * a BRANCH on both ends of it is connecting two alternatives. (Here we
  154. * have one of the subtle syntax dependencies: an individual BRANCH (as
  155. * opposed to a collection of them) is never concatenated with anything
  156. * because of operator precedence.) The operand of some types of node is
  157. * a literal string; for others, it is a node leading into a sub-FSM. In
  158. * particular, the operand of a BRANCH node is the first node of the branch.
  159. * (NB this is *not* a tree structure: the tail of the branch connects
  160. * to the thing following the set of BRANCHes.) The opcodes are:
  161. */
  162. // definition number opnd? meaning
  163. #define END 0 // no End of program.
  164. #define BOL 1 // no Match "" at beginning of line.
  165. #define EOL 2 // no Match "" at end of line.
  166. #define ANY 3 // no Match any one character.
  167. #define ANYOF 4 // str Match any character in this string.
  168. #define ANYBUT 5 // str Match any character not in this
  169. // string.
  170. #define BRANCH 6 // node Match this alternative, or the
  171. // next...
  172. #define BACK 7 // no Match "", "next" ptr points backward.
  173. #define EXACTLY 8 // str Match this string.
  174. #define NOTHING 9 // no Match empty string.
  175. #define STAR 10 // node Match this (simple) thing 0 or more
  176. // times.
  177. #define PLUS 11 // node Match this (simple) thing 1 or more
  178. // times.
  179. #define OPEN 20 // no Mark this point in input as start of
  180. // #n.
  181. // OPEN+1 is number 1, etc.
  182. #define CLOSE 30 // no Analogous to OPEN.
  183. /*
  184. * Opcode notes:
  185. *
  186. * BRANCH The set of branches constituting a single choice are hooked
  187. * together with their "next" pointers, since precedence prevents
  188. * anything being concatenated to any individual branch. The
  189. * "next" pointer of the last BRANCH in a choice points to the
  190. * thing following the whole choice. This is also where the
  191. * final "next" pointer of each individual branch points; each
  192. * branch starts with the operand node of a BRANCH node.
  193. *
  194. * BACK Normal "next" pointers all implicitly point forward; BACK
  195. * exists to make loop structures possible.
  196. *
  197. * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
  198. * BRANCH structures using BACK. Simple cases (one character
  199. * per match) are implemented with STAR and PLUS for speed
  200. * and to minimize recursive plunges.
  201. *
  202. * OPEN,CLOSE ...are numbered at compile time.
  203. */
  204. /*
  205. * A node is one char of opcode followed by two chars of "next" pointer.
  206. * "Next" pointers are stored as two 8-bit pieces, high order first. The
  207. * value is a positive offset from the opcode of the node containing it.
  208. * An operand, if any, simply follows the node. (Note that much of the
  209. * code generation knows about this implicit relationship.)
  210. *
  211. * Using two bytes for the "next" pointer is vast overkill for most things,
  212. * but allows patterns to get big without disasters.
  213. */
  214. #define OP(p) (*(p))
  215. #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
  216. #define OPERAND(p) ((p) + 3)
  217. const unsigned char MAGIC = 0234;
  218. /*
  219. * Utility definitions.
  220. */
  221. #define UCHARAT(p) ((const unsigned char*)(p))[0]
  222. #define FAIL(m) { regerror(m); return(NULL); }
  223. #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
  224. #define META "^$.[()|?+*\\"
  225. /*
  226. * Flags to be passed up and down.
  227. */
  228. #define HASWIDTH 01 // Known never to match null string.
  229. #define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
  230. #define SPSTART 04 // Starts with * or +.
  231. #define WORST 0 // Worst case.
  232. /////////////////////////////////////////////////////////////////////////
  233. //
  234. // COMPILE AND ASSOCIATED FUNCTIONS
  235. //
  236. /////////////////////////////////////////////////////////////////////////
  237. /*
  238. * Global work variables for compile().
  239. */
  240. static const char* regparse; // Input-scan pointer.
  241. static int regnpar; // () count.
  242. static char regdummy;
  243. static char* regcode; // Code-emit pointer; &regdummy = don't.
  244. static long regsize; // Code size.
  245. /*
  246. * Forward declarations for compile()'s friends.
  247. */
  248. // #ifndef static
  249. // #define static static
  250. // #endif
  251. static char* reg (int, int*);
  252. static char* regbranch (int*);
  253. static char* regpiece (int*);
  254. static char* regatom (int*);
  255. static char* regnode (char);
  256. static const char* regnext (register const char*);
  257. static char* regnext (register char*);
  258. static void regc (unsigned char);
  259. static void reginsert (char, char*);
  260. static void regtail (char*, const char*);
  261. static void regoptail (char*, const char*);
  262. #ifdef STRCSPN
  263. static int strcspn ();
  264. #endif
  265. /*
  266. * We can't allocate space until we know how big the compiled form will be,
  267. * but we can't compile it (and thus know how big it is) until we've got a
  268. * place to put the code. So we cheat: we compile it twice, once with code
  269. * generation turned off and size counting turned on, and once "for real".
  270. * This also means that we don't allocate space until we are sure that the
  271. * thing really will compile successfully, and we never have to move the
  272. * code and thus invalidate pointers into it. (Note that it has to be in
  273. * one piece because free() must be able to free it all.)
  274. *
  275. * Beware that the optimization-preparation code in here knows about some
  276. * of the structure of the compiled regexp.
  277. */
  278. // compile -- compile a regular expression into internal code
  279. // for later pattern matching.
  280. void cmRegularExpression::compile (const char* exp) {
  281. register const char* scan;
  282. register const char* longest;
  283. register unsigned long len;
  284. int flags;
  285. if (exp == NULL) {
  286. //RAISE Error, SYM(cmRegularExpression), SYM(No_Expr),
  287. printf ("cmRegularExpression::compile(): No expression supplied.\n");
  288. return;
  289. }
  290. // First pass: determine size, legality.
  291. regparse = exp;
  292. regnpar = 1;
  293. regsize = 0L;
  294. regcode = &regdummy;
  295. regc(MAGIC);
  296. if(!reg(0, &flags))
  297. {
  298. printf ("cmRegularExpression::compile(): Error in compile.\n");
  299. return;
  300. }
  301. this->startp[0] = this->endp[0] = this->searchstring = NULL;
  302. // Small enough for pointer-storage convention?
  303. if (regsize >= 32767L) { // Probably could be 65535L.
  304. //RAISE Error, SYM(cmRegularExpression), SYM(Expr_Too_Big),
  305. printf ("cmRegularExpression::compile(): Expression too big.\n");
  306. return;
  307. }
  308. // Allocate space.
  309. //#ifndef WIN32
  310. if (this->program != NULL) delete [] this->program;
  311. //#endif
  312. this->program = new char[regsize];
  313. this->progsize = (int) regsize;
  314. if (this->program == NULL) {
  315. //RAISE Error, SYM(cmRegularExpression), SYM(Out_Of_Memory),
  316. printf ("cmRegularExpression::compile(): Out of memory.\n");
  317. return;
  318. }
  319. // Second pass: emit code.
  320. regparse = exp;
  321. regnpar = 1;
  322. regcode = this->program;
  323. regc(MAGIC);
  324. reg(0, &flags);
  325. // Dig out information for optimizations.
  326. this->regstart = '\0'; // Worst-case defaults.
  327. this->reganch = 0;
  328. this->regmust = NULL;
  329. this->regmlen = 0;
  330. scan = this->program + 1; // First BRANCH.
  331. if (OP(regnext(scan)) == END) { // Only one top-level choice.
  332. scan = OPERAND(scan);
  333. // Starting-point info.
  334. if (OP(scan) == EXACTLY)
  335. this->regstart = *OPERAND(scan);
  336. else if (OP(scan) == BOL)
  337. this->reganch++;
  338. //
  339. // If there's something expensive in the r.e., find the longest
  340. // literal string that must appear and make it the regmust. Resolve
  341. // ties in favor of later strings, since the regstart check works
  342. // with the beginning of the r.e. and avoiding duplication
  343. // strengthens checking. Not a strong reason, but sufficient in the
  344. // absence of others.
  345. //
  346. if (flags & SPSTART) {
  347. longest = NULL;
  348. len = 0;
  349. for (; scan != NULL; scan = regnext(scan))
  350. if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
  351. longest = OPERAND(scan);
  352. len = strlen(OPERAND(scan));
  353. }
  354. this->regmust = longest;
  355. this->regmlen = len;
  356. }
  357. }
  358. }
  359. /*
  360. - reg - regular expression, i.e. main body or parenthesized thing
  361. *
  362. * Caller must absorb opening parenthesis.
  363. *
  364. * Combining parenthesis handling with the base level of regular expression
  365. * is a trifle forced, but the need to tie the tails of the branches to what
  366. * follows makes it hard to avoid.
  367. */
  368. static char* reg (int paren, int *flagp) {
  369. register char* ret;
  370. register char* br;
  371. register char* ender;
  372. register int parno =0;
  373. int flags;
  374. *flagp = HASWIDTH; // Tentatively.
  375. // Make an OPEN node, if parenthesized.
  376. if (paren) {
  377. if (regnpar >= NSUBEXP) {
  378. //RAISE Error, SYM(cmRegularExpression), SYM(Too_Many_Parens),
  379. printf ("cmRegularExpression::compile(): Too many parentheses.\n");
  380. return 0;
  381. }
  382. parno = regnpar;
  383. regnpar++;
  384. ret = regnode(OPEN + parno);
  385. }
  386. else
  387. ret = NULL;
  388. // Pick up the branches, linking them together.
  389. br = regbranch(&flags);
  390. if (br == NULL)
  391. return (NULL);
  392. if (ret != NULL)
  393. regtail(ret, br); // OPEN -> first.
  394. else
  395. ret = br;
  396. if (!(flags & HASWIDTH))
  397. *flagp &= ~HASWIDTH;
  398. *flagp |= flags & SPSTART;
  399. while (*regparse == '|') {
  400. regparse++;
  401. br = regbranch(&flags);
  402. if (br == NULL)
  403. return (NULL);
  404. regtail(ret, br); // BRANCH -> BRANCH.
  405. if (!(flags & HASWIDTH))
  406. *flagp &= ~HASWIDTH;
  407. *flagp |= flags & SPSTART;
  408. }
  409. // Make a closing node, and hook it on the end.
  410. ender = regnode((paren) ? CLOSE + parno : END);
  411. regtail(ret, ender);
  412. // Hook the tails of the branches to the closing node.
  413. for (br = ret; br != NULL; br = regnext(br))
  414. regoptail(br, ender);
  415. // Check for proper termination.
  416. if (paren && *regparse++ != ')') {
  417. //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
  418. printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
  419. return 0;
  420. }
  421. else if (!paren && *regparse != '\0') {
  422. if (*regparse == ')') {
  423. //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
  424. printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
  425. return 0;
  426. }
  427. else {
  428. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  429. printf ("cmRegularExpression::compile(): Internal error.\n");
  430. return 0;
  431. }
  432. // NOTREACHED
  433. }
  434. return (ret);
  435. }
  436. /*
  437. - regbranch - one alternative of an | operator
  438. *
  439. * Implements the concatenation operator.
  440. */
  441. static char* regbranch (int *flagp) {
  442. register char* ret;
  443. register char* chain;
  444. register char* latest;
  445. int flags;
  446. *flagp = WORST; // Tentatively.
  447. ret = regnode(BRANCH);
  448. chain = NULL;
  449. while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
  450. latest = regpiece(&flags);
  451. if (latest == NULL)
  452. return (NULL);
  453. *flagp |= flags & HASWIDTH;
  454. if (chain == NULL) // First piece.
  455. *flagp |= flags & SPSTART;
  456. else
  457. regtail(chain, latest);
  458. chain = latest;
  459. }
  460. if (chain == NULL) // Loop ran zero times.
  461. regnode(NOTHING);
  462. return (ret);
  463. }
  464. /*
  465. - regpiece - something followed by possible [*+?]
  466. *
  467. * Note that the branching code sequences used for ? and the general cases
  468. * of * and + are somewhat optimized: they use the same NOTHING node as
  469. * both the endmarker for their branch list and the body of the last branch.
  470. * It might seem that this node could be dispensed with entirely, but the
  471. * endmarker role is not redundant.
  472. */
  473. static char* regpiece (int *flagp) {
  474. register char* ret;
  475. register char op;
  476. register char* next;
  477. int flags;
  478. ret = regatom(&flags);
  479. if (ret == NULL)
  480. return (NULL);
  481. op = *regparse;
  482. if (!ISMULT(op)) {
  483. *flagp = flags;
  484. return (ret);
  485. }
  486. if (!(flags & HASWIDTH) && op != '?') {
  487. //RAISE Error, SYM(cmRegularExpression), SYM(Empty_Operand),
  488. printf ("cmRegularExpression::compile() : *+ operand could be empty.\n");
  489. return 0;
  490. }
  491. *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
  492. if (op == '*' && (flags & SIMPLE))
  493. reginsert(STAR, ret);
  494. else if (op == '*') {
  495. // Emit x* as (x&|), where & means "self".
  496. reginsert(BRANCH, ret); // Either x
  497. regoptail(ret, regnode(BACK)); // and loop
  498. regoptail(ret, ret); // back
  499. regtail(ret, regnode(BRANCH)); // or
  500. regtail(ret, regnode(NOTHING)); // null.
  501. }
  502. else if (op == '+' && (flags & SIMPLE))
  503. reginsert(PLUS, ret);
  504. else if (op == '+') {
  505. // Emit x+ as x(&|), where & means "self".
  506. next = regnode(BRANCH); // Either
  507. regtail(ret, next);
  508. regtail(regnode(BACK), ret); // loop back
  509. regtail(next, regnode(BRANCH)); // or
  510. regtail(ret, regnode(NOTHING)); // null.
  511. }
  512. else if (op == '?') {
  513. // Emit x? as (x|)
  514. reginsert(BRANCH, ret); // Either x
  515. regtail(ret, regnode(BRANCH)); // or
  516. next = regnode(NOTHING);// null.
  517. regtail(ret, next);
  518. regoptail(ret, next);
  519. }
  520. regparse++;
  521. if (ISMULT(*regparse)) {
  522. //RAISE Error, SYM(cmRegularExpression), SYM(Nested_Operand),
  523. printf ("cmRegularExpression::compile(): Nested *?+.\n");
  524. return 0;
  525. }
  526. return (ret);
  527. }
  528. /*
  529. - regatom - the lowest level
  530. *
  531. * Optimization: gobbles an entire sequence of ordinary characters so that
  532. * it can turn them into a single node, which is smaller to store and
  533. * faster to run. Backslashed characters are exceptions, each becoming a
  534. * separate node; the code is simpler that way and it's not worth fixing.
  535. */
  536. static char* regatom (int *flagp) {
  537. register char* ret;
  538. int flags;
  539. *flagp = WORST; // Tentatively.
  540. switch (*regparse++) {
  541. case '^':
  542. ret = regnode(BOL);
  543. break;
  544. case '$':
  545. ret = regnode(EOL);
  546. break;
  547. case '.':
  548. ret = regnode(ANY);
  549. *flagp |= HASWIDTH | SIMPLE;
  550. break;
  551. case '[':{
  552. register int rxpclass;
  553. register int rxpclassend;
  554. if (*regparse == '^') { // Complement of range.
  555. ret = regnode(ANYBUT);
  556. regparse++;
  557. }
  558. else
  559. ret = regnode(ANYOF);
  560. if (*regparse == ']' || *regparse == '-')
  561. regc(*regparse++);
  562. while (*regparse != '\0' && *regparse != ']') {
  563. if (*regparse == '-') {
  564. regparse++;
  565. if (*regparse == ']' || *regparse == '\0')
  566. regc('-');
  567. else {
  568. rxpclass = UCHARAT(regparse - 2) + 1;
  569. rxpclassend = UCHARAT(regparse);
  570. if (rxpclass > rxpclassend + 1) {
  571. //RAISE Error, SYM(cmRegularExpression), SYM(Invalid_Range),
  572. printf ("cmRegularExpression::compile(): Invalid range in [].\n");
  573. return 0;
  574. }
  575. for (; rxpclass <= rxpclassend; rxpclass++)
  576. regc(rxpclass);
  577. regparse++;
  578. }
  579. }
  580. else
  581. regc(*regparse++);
  582. }
  583. regc('\0');
  584. if (*regparse != ']') {
  585. //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Bracket),
  586. printf ("cmRegularExpression::compile(): Unmatched [].\n");
  587. return 0;
  588. }
  589. regparse++;
  590. *flagp |= HASWIDTH | SIMPLE;
  591. }
  592. break;
  593. case '(':
  594. ret = reg(1, &flags);
  595. if (ret == NULL)
  596. return (NULL);
  597. *flagp |= flags & (HASWIDTH | SPSTART);
  598. break;
  599. case '\0':
  600. case '|':
  601. case ')':
  602. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  603. printf ("cmRegularExpression::compile(): Internal error.\n"); // Never here
  604. return 0;
  605. case '?':
  606. case '+':
  607. case '*':
  608. //RAISE Error, SYM(cmRegularExpression), SYM(No_Operand),
  609. printf ("cmRegularExpression::compile(): ?+* follows nothing.\n");
  610. return 0;
  611. case '\\':
  612. if (*regparse == '\0') {
  613. //RAISE Error, SYM(cmRegularExpression), SYM(Trailing_Backslash),
  614. printf ("cmRegularExpression::compile(): Trailing backslash.\n");
  615. return 0;
  616. }
  617. ret = regnode(EXACTLY);
  618. regc(*regparse++);
  619. regc('\0');
  620. *flagp |= HASWIDTH | SIMPLE;
  621. break;
  622. default:{
  623. register int len;
  624. register char ender;
  625. regparse--;
  626. len = strcspn(regparse, META);
  627. if (len <= 0) {
  628. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  629. printf ("cmRegularExpression::compile(): Internal error.\n");
  630. return 0;
  631. }
  632. ender = *(regparse + len);
  633. if (len > 1 && ISMULT(ender))
  634. len--; // Back off clear of ?+* operand.
  635. *flagp |= HASWIDTH;
  636. if (len == 1)
  637. *flagp |= SIMPLE;
  638. ret = regnode(EXACTLY);
  639. while (len > 0) {
  640. regc(*regparse++);
  641. len--;
  642. }
  643. regc('\0');
  644. }
  645. break;
  646. }
  647. return (ret);
  648. }
  649. /*
  650. - regnode - emit a node
  651. Location.
  652. */
  653. static char* regnode (char op) {
  654. register char* ret;
  655. register char* ptr;
  656. ret = regcode;
  657. if (ret == &regdummy) {
  658. regsize += 3;
  659. return (ret);
  660. }
  661. ptr = ret;
  662. *ptr++ = op;
  663. *ptr++ = '\0'; // Null "next" pointer.
  664. *ptr++ = '\0';
  665. regcode = ptr;
  666. return (ret);
  667. }
  668. /*
  669. - regc - emit (if appropriate) a byte of code
  670. */
  671. static void regc (unsigned char b) {
  672. if (regcode != &regdummy)
  673. *regcode++ = b;
  674. else
  675. regsize++;
  676. }
  677. /*
  678. - reginsert - insert an operator in front of already-emitted operand
  679. *
  680. * Means relocating the operand.
  681. */
  682. static void reginsert (char op, char* opnd) {
  683. register char* src;
  684. register char* dst;
  685. register char* place;
  686. if (regcode == &regdummy) {
  687. regsize += 3;
  688. return;
  689. }
  690. src = regcode;
  691. regcode += 3;
  692. dst = regcode;
  693. while (src > opnd)
  694. *--dst = *--src;
  695. place = opnd; // Op node, where operand used to be.
  696. *place++ = op;
  697. *place++ = '\0';
  698. *place++ = '\0';
  699. }
  700. /*
  701. - regtail - set the next-pointer at the end of a node chain
  702. */
  703. static void regtail (char* p, const char* val) {
  704. register char* scan;
  705. register char* temp;
  706. register int offset;
  707. if (p == &regdummy)
  708. return;
  709. // Find last node.
  710. scan = p;
  711. for (;;) {
  712. temp = regnext(scan);
  713. if (temp == NULL)
  714. break;
  715. scan = temp;
  716. }
  717. if (OP(scan) == BACK)
  718. offset = (const char*)scan - val;
  719. else
  720. offset = val - scan;
  721. *(scan + 1) = (offset >> 8) & 0377;
  722. *(scan + 2) = offset & 0377;
  723. }
  724. /*
  725. - regoptail - regtail on operand of first argument; nop if operandless
  726. */
  727. static void regoptail (char* p, const char* val) {
  728. // "Operandless" and "op != BRANCH" are synonymous in practice.
  729. if (p == NULL || p == &regdummy || OP(p) != BRANCH)
  730. return;
  731. regtail(OPERAND(p), val);
  732. }
  733. ////////////////////////////////////////////////////////////////////////
  734. //
  735. // find and friends
  736. //
  737. ////////////////////////////////////////////////////////////////////////
  738. /*
  739. * Global work variables for find().
  740. */
  741. static const char* reginput; // String-input pointer.
  742. static const char* regbol; // Beginning of input, for ^ check.
  743. static const char* *regstartp; // Pointer to startp array.
  744. static const char* *regendp; // Ditto for endp.
  745. /*
  746. * Forwards.
  747. */
  748. static int regtry (const char*, const char* *,
  749. const char* *, const char*);
  750. static int regmatch (const char*);
  751. static int regrepeat (const char*);
  752. #ifdef DEBUG
  753. int regnarrate = 0;
  754. void regdump ();
  755. static char* regprop ();
  756. #endif
  757. bool cmRegularExpression::find (std::string const& s)
  758. {
  759. return find(s.c_str());
  760. }
  761. // find -- Matches the regular expression to the given string.
  762. // Returns true if found, and sets start and end indexes accordingly.
  763. bool cmRegularExpression::find (const char* string) {
  764. register const char* s;
  765. this->searchstring = string;
  766. // Check validity of program.
  767. if (!this->program || UCHARAT(this->program) != MAGIC) {
  768. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  769. printf ("cmRegularExpression::find(): Compiled regular expression corrupted.\n");
  770. return 0;
  771. }
  772. // If there is a "must appear" string, look for it.
  773. if (this->regmust != NULL) {
  774. s = string;
  775. while ((s = strchr(s, this->regmust[0])) != NULL) {
  776. if (strncmp(s, this->regmust, this->regmlen) == 0)
  777. break; // Found it.
  778. s++;
  779. }
  780. if (s == NULL) // Not present.
  781. return (0);
  782. }
  783. // Mark beginning of line for ^ .
  784. regbol = string;
  785. // Simplest case: anchored match need be tried only once.
  786. if (this->reganch)
  787. return (regtry(string, this->startp, this->endp, this->program) != 0);
  788. // Messy cases: unanchored match.
  789. s = string;
  790. if (this->regstart != '\0')
  791. // We know what char it must start with.
  792. while ((s = strchr(s, this->regstart)) != NULL) {
  793. if (regtry(s, this->startp, this->endp, this->program))
  794. return (1);
  795. s++;
  796. }
  797. else
  798. // We don't -- general case.
  799. do {
  800. if (regtry(s, this->startp, this->endp, this->program))
  801. return (1);
  802. } while (*s++ != '\0');
  803. // Failure.
  804. return (0);
  805. }
  806. /*
  807. - regtry - try match at specific point
  808. 0 failure, 1 success
  809. */
  810. static int regtry (const char* string, const char* *start,
  811. const char* *end, const char* prog) {
  812. register int i;
  813. register const char* *sp1;
  814. register const char* *ep;
  815. reginput = string;
  816. regstartp = start;
  817. regendp = end;
  818. sp1 = start;
  819. ep = end;
  820. for (i = NSUBEXP; i > 0; i--) {
  821. *sp1++ = NULL;
  822. *ep++ = NULL;
  823. }
  824. if (regmatch(prog + 1)) {
  825. start[0] = string;
  826. end[0] = reginput;
  827. return (1);
  828. }
  829. else
  830. return (0);
  831. }
  832. /*
  833. - regmatch - main matching routine
  834. *
  835. * Conceptually the strategy is simple: check to see whether the current
  836. * node matches, call self recursively to see whether the rest matches,
  837. * and then act accordingly. In practice we make some effort to avoid
  838. * recursion, in particular by going through "ordinary" nodes (that don't
  839. * need to know whether the rest of the match failed) by a loop instead of
  840. * by recursion.
  841. * 0 failure, 1 success
  842. */
  843. static int regmatch (const char* prog) {
  844. register const char* scan; // Current node.
  845. const char* next; // Next node.
  846. scan = prog;
  847. while (scan != NULL) {
  848. next = regnext(scan);
  849. switch (OP(scan)) {
  850. case BOL:
  851. if (reginput != regbol)
  852. return (0);
  853. break;
  854. case EOL:
  855. if (*reginput != '\0')
  856. return (0);
  857. break;
  858. case ANY:
  859. if (*reginput == '\0')
  860. return (0);
  861. reginput++;
  862. break;
  863. case EXACTLY:{
  864. register int len;
  865. register const char* opnd;
  866. opnd = OPERAND(scan);
  867. // Inline the first character, for speed.
  868. if (*opnd != *reginput)
  869. return (0);
  870. len = strlen(opnd);
  871. if (len > 1 && strncmp(opnd, reginput, len) != 0)
  872. return (0);
  873. reginput += len;
  874. }
  875. break;
  876. case ANYOF:
  877. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
  878. return (0);
  879. reginput++;
  880. break;
  881. case ANYBUT:
  882. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
  883. return (0);
  884. reginput++;
  885. break;
  886. case NOTHING:
  887. break;
  888. case BACK:
  889. break;
  890. case OPEN + 1:
  891. case OPEN + 2:
  892. case OPEN + 3:
  893. case OPEN + 4:
  894. case OPEN + 5:
  895. case OPEN + 6:
  896. case OPEN + 7:
  897. case OPEN + 8:
  898. case OPEN + 9:{
  899. register int no;
  900. register const char* save;
  901. no = OP(scan) - OPEN;
  902. save = reginput;
  903. if (regmatch(next)) {
  904. //
  905. // Don't set startp if some later invocation of the
  906. // same parentheses already has.
  907. //
  908. if (regstartp[no] == NULL)
  909. regstartp[no] = save;
  910. return (1);
  911. }
  912. else
  913. return (0);
  914. }
  915. // break;
  916. case CLOSE + 1:
  917. case CLOSE + 2:
  918. case CLOSE + 3:
  919. case CLOSE + 4:
  920. case CLOSE + 5:
  921. case CLOSE + 6:
  922. case CLOSE + 7:
  923. case CLOSE + 8:
  924. case CLOSE + 9:{
  925. register int no;
  926. register const char* save;
  927. no = OP(scan) - CLOSE;
  928. save = reginput;
  929. if (regmatch(next)) {
  930. //
  931. // Don't set endp if some later invocation of the
  932. // same parentheses already has.
  933. //
  934. if (regendp[no] == NULL)
  935. regendp[no] = save;
  936. return (1);
  937. }
  938. else
  939. return (0);
  940. }
  941. // break;
  942. case BRANCH:{
  943. register const char* save;
  944. if (OP(next) != BRANCH) // No choice.
  945. next = OPERAND(scan); // Avoid recursion.
  946. else {
  947. do {
  948. save = reginput;
  949. if (regmatch(OPERAND(scan)))
  950. return (1);
  951. reginput = save;
  952. scan = regnext(scan);
  953. } while (scan != NULL && OP(scan) == BRANCH);
  954. return (0);
  955. // NOTREACHED
  956. }
  957. }
  958. break;
  959. case STAR:
  960. case PLUS:{
  961. register char nextch;
  962. register int no;
  963. register const char* save;
  964. register int min_no;
  965. //
  966. // Lookahead to avoid useless match attempts when we know
  967. // what character comes next.
  968. //
  969. nextch = '\0';
  970. if (OP(next) == EXACTLY)
  971. nextch = *OPERAND(next);
  972. min_no = (OP(scan) == STAR) ? 0 : 1;
  973. save = reginput;
  974. no = regrepeat(OPERAND(scan));
  975. while (no >= min_no) {
  976. // If it could work, try it.
  977. if (nextch == '\0' || *reginput == nextch)
  978. if (regmatch(next))
  979. return (1);
  980. // Couldn't or didn't -- back up.
  981. no--;
  982. reginput = save + no;
  983. }
  984. return (0);
  985. }
  986. // break;
  987. case END:
  988. return (1); // Success!
  989. default:
  990. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  991. printf ("cmRegularExpression::find(): Internal error -- memory corrupted.\n");
  992. return 0;
  993. }
  994. scan = next;
  995. }
  996. //
  997. // We get here only if there's trouble -- normally "case END" is the
  998. // terminating point.
  999. //
  1000. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  1001. printf ("cmRegularExpression::find(): Internal error -- corrupted pointers.\n");
  1002. return (0);
  1003. }
  1004. /*
  1005. - regrepeat - repeatedly match something simple, report how many
  1006. */
  1007. static int regrepeat (const char* p) {
  1008. register int count = 0;
  1009. register const char* scan;
  1010. register const char* opnd;
  1011. scan = reginput;
  1012. opnd = OPERAND(p);
  1013. switch (OP(p)) {
  1014. case ANY:
  1015. count = strlen(scan);
  1016. scan += count;
  1017. break;
  1018. case EXACTLY:
  1019. while (*opnd == *scan) {
  1020. count++;
  1021. scan++;
  1022. }
  1023. break;
  1024. case ANYOF:
  1025. while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
  1026. count++;
  1027. scan++;
  1028. }
  1029. break;
  1030. case ANYBUT:
  1031. while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
  1032. count++;
  1033. scan++;
  1034. }
  1035. break;
  1036. default: // Oh dear. Called inappropriately.
  1037. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  1038. printf ("cm RegularExpression::find(): Internal error.\n");
  1039. return 0;
  1040. }
  1041. reginput = scan;
  1042. return (count);
  1043. }
  1044. /*
  1045. - regnext - dig the "next" pointer out of a node
  1046. */
  1047. static const char* regnext (register const char* p) {
  1048. register int offset;
  1049. if (p == &regdummy)
  1050. return (NULL);
  1051. offset = NEXT(p);
  1052. if (offset == 0)
  1053. return (NULL);
  1054. if (OP(p) == BACK)
  1055. return (p - offset);
  1056. else
  1057. return (p + offset);
  1058. }
  1059. static char* regnext (register char* p) {
  1060. register int offset;
  1061. if (p == &regdummy)
  1062. return (NULL);
  1063. offset = NEXT(p);
  1064. if (offset == 0)
  1065. return (NULL);
  1066. if (OP(p) == BACK)
  1067. return (p - offset);
  1068. else
  1069. return (p + offset);
  1070. }