cmRegularExpression.cxx 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196
  1. /*=========================================================================
  2. Program: Insight Segmentation & Registration Toolkit
  3. Module: $RCSfile$
  4. Language: C++
  5. Date: $Date$
  6. Version: $Revision$
  7. Copyright (c) 2002 Insight Consortium. All rights reserved.
  8. See ITKCopyright.txt or http://www.itk.org/HTML/Copyright.htm for details.
  9. This software is distributed WITHOUT ANY WARRANTY; without even
  10. the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  11. PURPOSE. See the above copyright notices for more information.
  12. =========================================================================*/
  13. //
  14. // Copyright (C) 1991 Texas Instruments Incorporated.
  15. //
  16. // Permission is granted to any individual or institution to use, copy, modify,
  17. // and distribute this software, provided that this complete copyright and
  18. // permission notice is maintained, intact, in all copies and supporting
  19. // documentation.
  20. //
  21. // Texas Instruments Incorporated provides this software "as is" without
  22. // express or implied warranty.
  23. //
  24. //
  25. // Created: MNF 06/13/89 Initial Design and Implementation
  26. // Updated: LGO 08/09/89 Inherit from Generic
  27. // Updated: MBN 09/07/89 Added conditional exception handling
  28. // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
  29. // Updated: DLS 03/22/91 New lite version
  30. //
  31. #include "cmRegularExpression.h" // Include class specification
  32. #include "cmStandardIncludes.h"
  33. #include <stdio.h>
  34. // cmRegularExpression -- Copies the given regular expression.
  35. cmRegularExpression::cmRegularExpression (const cmRegularExpression& rxp) {
  36. int ind;
  37. this->progsize = rxp.progsize; // Copy regular expression size
  38. this->program = new char[this->progsize]; // Allocate storage
  39. for(ind=this->progsize; ind-- != 0;) // Copy regular expresion
  40. this->program[ind] = rxp.program[ind];
  41. this->startp[0] = rxp.startp[0]; // Copy pointers into last
  42. this->endp[0] = rxp.endp[0]; // Successful "find" operation
  43. this->regmust = rxp.regmust; // Copy field
  44. if (rxp.regmust != NULL) {
  45. char* dum = rxp.program;
  46. ind = 0;
  47. while (dum != rxp.regmust) {
  48. ++dum;
  49. ++ind;
  50. }
  51. this->regmust = this->program + ind;
  52. }
  53. this->regstart = rxp.regstart; // Copy starting index
  54. this->reganch = rxp.reganch; // Copy remaining private data
  55. this->regmlen = rxp.regmlen; // Copy remaining private data
  56. }
  57. // operator== -- Returns true if two regular expressions have the same
  58. // compiled program for pattern matching.
  59. bool cmRegularExpression::operator== (const cmRegularExpression& rxp) const {
  60. if (this != &rxp) { // Same address?
  61. int ind = this->progsize; // Get regular expression size
  62. if (ind != rxp.progsize) // If different size regexp
  63. return false; // Return failure
  64. while(ind-- != 0) // Else while still characters
  65. if(this->program[ind] != rxp.program[ind]) // If regexp are different
  66. return false; // Return failure
  67. }
  68. return true; // Else same, return success
  69. }
  70. // deep_equal -- Returns true if have the same compiled regular expressions
  71. // and the same start and end pointers.
  72. bool cmRegularExpression::deep_equal (const cmRegularExpression& rxp) const {
  73. int ind = this->progsize; // Get regular expression size
  74. if (ind != rxp.progsize) // If different size regexp
  75. return false; // Return failure
  76. while(ind-- != 0) // Else while still characters
  77. if(this->program[ind] != rxp.program[ind]) // If regexp are different
  78. return false; // Return failure
  79. return (this->startp[0] == rxp.startp[0] && // Else if same start/end ptrs,
  80. this->endp[0] == rxp.endp[0]); // Return true
  81. }
  82. // The remaining code in this file is derived from the regular expression code
  83. // whose copyright statement appears below. It has been changed to work
  84. // with the class concepts of C++ and COOL.
  85. /*
  86. * compile and find
  87. *
  88. * Copyright (c) 1986 by University of Toronto.
  89. * Written by Henry Spencer. Not derived from licensed software.
  90. *
  91. * Permission is granted to anyone to use this software for any
  92. * purpose on any computer system, and to redistribute it freely,
  93. * subject to the following restrictions:
  94. *
  95. * 1. The author is not responsible for the consequences of use of
  96. * this software, no matter how awful, even if they arise
  97. * from defects in it.
  98. *
  99. * 2. The origin of this software must not be misrepresented, either
  100. * by explicit claim or by omission.
  101. *
  102. * 3. Altered versions must be plainly marked as such, and must not
  103. * be misrepresented as being the original software.
  104. *
  105. * Beware that some of this code is subtly aware of the way operator
  106. * precedence is structured in regular expressions. Serious changes in
  107. * regular-expression syntax might require a total rethink.
  108. */
  109. /*
  110. * The "internal use only" fields in regexp.h are present to pass info from
  111. * compile to execute that permits the execute phase to run lots faster on
  112. * simple cases. They are:
  113. *
  114. * regstart char that must begin a match; '\0' if none obvious
  115. * reganch is the match anchored (at beginning-of-line only)?
  116. * regmust string (pointer into program) that match must include, or NULL
  117. * regmlen length of regmust string
  118. *
  119. * Regstart and reganch permit very fast decisions on suitable starting points
  120. * for a match, cutting down the work a lot. Regmust permits fast rejection
  121. * of lines that cannot possibly match. The regmust tests are costly enough
  122. * that compile() supplies a regmust only if the r.e. contains something
  123. * potentially expensive (at present, the only such thing detected is * or +
  124. * at the start of the r.e., which can involve a lot of backup). Regmlen is
  125. * supplied because the test in find() needs it and compile() is computing
  126. * it anyway.
  127. */
  128. /*
  129. * Structure for regexp "program". This is essentially a linear encoding
  130. * of a nondeterministic finite-state machine (aka syntax charts or
  131. * "railroad normal form" in parsing technology). Each node is an opcode
  132. * plus a "next" pointer, possibly plus an operand. "Next" pointers of
  133. * all nodes except BRANCH implement concatenation; a "next" pointer with
  134. * a BRANCH on both ends of it is connecting two alternatives. (Here we
  135. * have one of the subtle syntax dependencies: an individual BRANCH (as
  136. * opposed to a collection of them) is never concatenated with anything
  137. * because of operator precedence.) The operand of some types of node is
  138. * a literal string; for others, it is a node leading into a sub-FSM. In
  139. * particular, the operand of a BRANCH node is the first node of the branch.
  140. * (NB this is *not* a tree structure: the tail of the branch connects
  141. * to the thing following the set of BRANCHes.) The opcodes are:
  142. */
  143. // definition number opnd? meaning
  144. #define END 0 // no End of program.
  145. #define BOL 1 // no Match "" at beginning of line.
  146. #define EOL 2 // no Match "" at end of line.
  147. #define ANY 3 // no Match any one character.
  148. #define ANYOF 4 // str Match any character in this string.
  149. #define ANYBUT 5 // str Match any character not in this
  150. // string.
  151. #define BRANCH 6 // node Match this alternative, or the
  152. // next...
  153. #define BACK 7 // no Match "", "next" ptr points backward.
  154. #define EXACTLY 8 // str Match this string.
  155. #define NOTHING 9 // no Match empty string.
  156. #define STAR 10 // node Match this (simple) thing 0 or more
  157. // times.
  158. #define PLUS 11 // node Match this (simple) thing 1 or more
  159. // times.
  160. #define OPEN 20 // no Mark this point in input as start of
  161. // #n.
  162. // OPEN+1 is number 1, etc.
  163. #define CLOSE 30 // no Analogous to OPEN.
  164. /*
  165. * Opcode notes:
  166. *
  167. * BRANCH The set of branches constituting a single choice are hooked
  168. * together with their "next" pointers, since precedence prevents
  169. * anything being concatenated to any individual branch. The
  170. * "next" pointer of the last BRANCH in a choice points to the
  171. * thing following the whole choice. This is also where the
  172. * final "next" pointer of each individual branch points; each
  173. * branch starts with the operand node of a BRANCH node.
  174. *
  175. * BACK Normal "next" pointers all implicitly point forward; BACK
  176. * exists to make loop structures possible.
  177. *
  178. * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
  179. * BRANCH structures using BACK. Simple cases (one character
  180. * per match) are implemented with STAR and PLUS for speed
  181. * and to minimize recursive plunges.
  182. *
  183. * OPEN,CLOSE ...are numbered at compile time.
  184. */
  185. /*
  186. * A node is one char of opcode followed by two chars of "next" pointer.
  187. * "Next" pointers are stored as two 8-bit pieces, high order first. The
  188. * value is a positive offset from the opcode of the node containing it.
  189. * An operand, if any, simply follows the node. (Note that much of the
  190. * code generation knows about this implicit relationship.)
  191. *
  192. * Using two bytes for the "next" pointer is vast overkill for most things,
  193. * but allows patterns to get big without disasters.
  194. */
  195. #define OP(p) (*(p))
  196. #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
  197. #define OPERAND(p) ((p) + 3)
  198. const unsigned char MAGIC = 0234;
  199. /*
  200. * Utility definitions.
  201. */
  202. #define UCHARAT(p) ((const unsigned char*)(p))[0]
  203. #define FAIL(m) { regerror(m); return(NULL); }
  204. #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
  205. #define META "^$.[()|?+*\\"
  206. /*
  207. * Flags to be passed up and down.
  208. */
  209. #define HASWIDTH 01 // Known never to match null string.
  210. #define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
  211. #define SPSTART 04 // Starts with * or +.
  212. #define WORST 0 // Worst case.
  213. /////////////////////////////////////////////////////////////////////////
  214. //
  215. // COMPILE AND ASSOCIATED FUNCTIONS
  216. //
  217. /////////////////////////////////////////////////////////////////////////
  218. /*
  219. * Global work variables for compile().
  220. */
  221. static const char* regparse; // Input-scan pointer.
  222. static int regnpar; // () count.
  223. static char regdummy;
  224. static char* regcode; // Code-emit pointer; &regdummy = don't.
  225. static long regsize; // Code size.
  226. /*
  227. * Forward declarations for compile()'s friends.
  228. */
  229. // #ifndef static
  230. // #define static static
  231. // #endif
  232. static char* reg (int, int*);
  233. static char* regbranch (int*);
  234. static char* regpiece (int*);
  235. static char* regatom (int*);
  236. static char* regnode (char);
  237. static const char* regnext (register const char*);
  238. static char* regnext (register char*);
  239. static void regc (unsigned char);
  240. static void reginsert (char, char*);
  241. static void regtail (char*, const char*);
  242. static void regoptail (char*, const char*);
  243. #ifdef STRCSPN
  244. static int strcspn ();
  245. #endif
  246. /*
  247. * We can't allocate space until we know how big the compiled form will be,
  248. * but we can't compile it (and thus know how big it is) until we've got a
  249. * place to put the code. So we cheat: we compile it twice, once with code
  250. * generation turned off and size counting turned on, and once "for real".
  251. * This also means that we don't allocate space until we are sure that the
  252. * thing really will compile successfully, and we never have to move the
  253. * code and thus invalidate pointers into it. (Note that it has to be in
  254. * one piece because free() must be able to free it all.)
  255. *
  256. * Beware that the optimization-preparation code in here knows about some
  257. * of the structure of the compiled regexp.
  258. */
  259. // compile -- compile a regular expression into internal code
  260. // for later pattern matching.
  261. void cmRegularExpression::compile (const char* exp) {
  262. register const char* scan;
  263. register const char* longest;
  264. register unsigned long len;
  265. int flags;
  266. if (exp == NULL) {
  267. //RAISE Error, SYM(cmRegularExpression), SYM(No_Expr),
  268. printf ("cmRegularExpression::compile(): No expression supplied.\n");
  269. return;
  270. }
  271. // First pass: determine size, legality.
  272. regparse = exp;
  273. regnpar = 1;
  274. regsize = 0L;
  275. regcode = &regdummy;
  276. regc(MAGIC);
  277. if(!reg(0, &flags))
  278. {
  279. printf ("cmRegularExpression::compile(): Error in compile.\n");
  280. return;
  281. }
  282. this->startp[0] = this->endp[0] = this->searchstring = NULL;
  283. // Small enough for pointer-storage convention?
  284. if (regsize >= 32767L) { // Probably could be 65535L.
  285. //RAISE Error, SYM(cmRegularExpression), SYM(Expr_Too_Big),
  286. printf ("cmRegularExpression::compile(): Expression too big.\n");
  287. return;
  288. }
  289. // Allocate space.
  290. //#ifndef WIN32
  291. if (this->program != NULL) delete [] this->program;
  292. //#endif
  293. this->program = new char[regsize];
  294. this->progsize = (int) regsize;
  295. if (this->program == NULL) {
  296. //RAISE Error, SYM(cmRegularExpression), SYM(Out_Of_Memory),
  297. printf ("cmRegularExpression::compile(): Out of memory.\n");
  298. return;
  299. }
  300. // Second pass: emit code.
  301. regparse = exp;
  302. regnpar = 1;
  303. regcode = this->program;
  304. regc(MAGIC);
  305. reg(0, &flags);
  306. // Dig out information for optimizations.
  307. this->regstart = '\0'; // Worst-case defaults.
  308. this->reganch = 0;
  309. this->regmust = NULL;
  310. this->regmlen = 0;
  311. scan = this->program + 1; // First BRANCH.
  312. if (OP(regnext(scan)) == END) { // Only one top-level choice.
  313. scan = OPERAND(scan);
  314. // Starting-point info.
  315. if (OP(scan) == EXACTLY)
  316. this->regstart = *OPERAND(scan);
  317. else if (OP(scan) == BOL)
  318. this->reganch++;
  319. //
  320. // If there's something expensive in the r.e., find the longest
  321. // literal string that must appear and make it the regmust. Resolve
  322. // ties in favor of later strings, since the regstart check works
  323. // with the beginning of the r.e. and avoiding duplication
  324. // strengthens checking. Not a strong reason, but sufficient in the
  325. // absence of others.
  326. //
  327. if (flags & SPSTART) {
  328. longest = NULL;
  329. len = 0;
  330. for (; scan != NULL; scan = regnext(scan))
  331. if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
  332. longest = OPERAND(scan);
  333. len = strlen(OPERAND(scan));
  334. }
  335. this->regmust = longest;
  336. this->regmlen = len;
  337. }
  338. }
  339. }
  340. /*
  341. - reg - regular expression, i.e. main body or parenthesized thing
  342. *
  343. * Caller must absorb opening parenthesis.
  344. *
  345. * Combining parenthesis handling with the base level of regular expression
  346. * is a trifle forced, but the need to tie the tails of the branches to what
  347. * follows makes it hard to avoid.
  348. */
  349. static char* reg (int paren, int *flagp) {
  350. register char* ret;
  351. register char* br;
  352. register char* ender;
  353. register int parno =0;
  354. int flags;
  355. *flagp = HASWIDTH; // Tentatively.
  356. // Make an OPEN node, if parenthesized.
  357. if (paren) {
  358. if (regnpar >= NSUBEXP) {
  359. //RAISE Error, SYM(cmRegularExpression), SYM(Too_Many_Parens),
  360. printf ("cmRegularExpression::compile(): Too many parentheses.\n");
  361. return 0;
  362. }
  363. parno = regnpar;
  364. regnpar++;
  365. ret = regnode(OPEN + parno);
  366. }
  367. else
  368. ret = NULL;
  369. // Pick up the branches, linking them together.
  370. br = regbranch(&flags);
  371. if (br == NULL)
  372. return (NULL);
  373. if (ret != NULL)
  374. regtail(ret, br); // OPEN -> first.
  375. else
  376. ret = br;
  377. if (!(flags & HASWIDTH))
  378. *flagp &= ~HASWIDTH;
  379. *flagp |= flags & SPSTART;
  380. while (*regparse == '|') {
  381. regparse++;
  382. br = regbranch(&flags);
  383. if (br == NULL)
  384. return (NULL);
  385. regtail(ret, br); // BRANCH -> BRANCH.
  386. if (!(flags & HASWIDTH))
  387. *flagp &= ~HASWIDTH;
  388. *flagp |= flags & SPSTART;
  389. }
  390. // Make a closing node, and hook it on the end.
  391. ender = regnode((paren) ? CLOSE + parno : END);
  392. regtail(ret, ender);
  393. // Hook the tails of the branches to the closing node.
  394. for (br = ret; br != NULL; br = regnext(br))
  395. regoptail(br, ender);
  396. // Check for proper termination.
  397. if (paren && *regparse++ != ')') {
  398. //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
  399. printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
  400. return 0;
  401. }
  402. else if (!paren && *regparse != '\0') {
  403. if (*regparse == ')') {
  404. //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
  405. printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
  406. return 0;
  407. }
  408. else {
  409. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  410. printf ("cmRegularExpression::compile(): Internal error.\n");
  411. return 0;
  412. }
  413. // NOTREACHED
  414. }
  415. return (ret);
  416. }
  417. /*
  418. - regbranch - one alternative of an | operator
  419. *
  420. * Implements the concatenation operator.
  421. */
  422. static char* regbranch (int *flagp) {
  423. register char* ret;
  424. register char* chain;
  425. register char* latest;
  426. int flags;
  427. *flagp = WORST; // Tentatively.
  428. ret = regnode(BRANCH);
  429. chain = NULL;
  430. while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
  431. latest = regpiece(&flags);
  432. if (latest == NULL)
  433. return (NULL);
  434. *flagp |= flags & HASWIDTH;
  435. if (chain == NULL) // First piece.
  436. *flagp |= flags & SPSTART;
  437. else
  438. regtail(chain, latest);
  439. chain = latest;
  440. }
  441. if (chain == NULL) // Loop ran zero times.
  442. regnode(NOTHING);
  443. return (ret);
  444. }
  445. /*
  446. - regpiece - something followed by possible [*+?]
  447. *
  448. * Note that the branching code sequences used for ? and the general cases
  449. * of * and + are somewhat optimized: they use the same NOTHING node as
  450. * both the endmarker for their branch list and the body of the last branch.
  451. * It might seem that this node could be dispensed with entirely, but the
  452. * endmarker role is not redundant.
  453. */
  454. static char* regpiece (int *flagp) {
  455. register char* ret;
  456. register char op;
  457. register char* next;
  458. int flags;
  459. ret = regatom(&flags);
  460. if (ret == NULL)
  461. return (NULL);
  462. op = *regparse;
  463. if (!ISMULT(op)) {
  464. *flagp = flags;
  465. return (ret);
  466. }
  467. if (!(flags & HASWIDTH) && op != '?') {
  468. //RAISE Error, SYM(cmRegularExpression), SYM(Empty_Operand),
  469. printf ("cmRegularExpression::compile() : *+ operand could be empty.\n");
  470. return 0;
  471. }
  472. *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
  473. if (op == '*' && (flags & SIMPLE))
  474. reginsert(STAR, ret);
  475. else if (op == '*') {
  476. // Emit x* as (x&|), where & means "self".
  477. reginsert(BRANCH, ret); // Either x
  478. regoptail(ret, regnode(BACK)); // and loop
  479. regoptail(ret, ret); // back
  480. regtail(ret, regnode(BRANCH)); // or
  481. regtail(ret, regnode(NOTHING)); // null.
  482. }
  483. else if (op == '+' && (flags & SIMPLE))
  484. reginsert(PLUS, ret);
  485. else if (op == '+') {
  486. // Emit x+ as x(&|), where & means "self".
  487. next = regnode(BRANCH); // Either
  488. regtail(ret, next);
  489. regtail(regnode(BACK), ret); // loop back
  490. regtail(next, regnode(BRANCH)); // or
  491. regtail(ret, regnode(NOTHING)); // null.
  492. }
  493. else if (op == '?') {
  494. // Emit x? as (x|)
  495. reginsert(BRANCH, ret); // Either x
  496. regtail(ret, regnode(BRANCH)); // or
  497. next = regnode(NOTHING);// null.
  498. regtail(ret, next);
  499. regoptail(ret, next);
  500. }
  501. regparse++;
  502. if (ISMULT(*regparse)) {
  503. //RAISE Error, SYM(cmRegularExpression), SYM(Nested_Operand),
  504. printf ("cmRegularExpression::compile(): Nested *?+.\n");
  505. return 0;
  506. }
  507. return (ret);
  508. }
  509. /*
  510. - regatom - the lowest level
  511. *
  512. * Optimization: gobbles an entire sequence of ordinary characters so that
  513. * it can turn them into a single node, which is smaller to store and
  514. * faster to run. Backslashed characters are exceptions, each becoming a
  515. * separate node; the code is simpler that way and it's not worth fixing.
  516. */
  517. static char* regatom (int *flagp) {
  518. register char* ret;
  519. int flags;
  520. *flagp = WORST; // Tentatively.
  521. switch (*regparse++) {
  522. case '^':
  523. ret = regnode(BOL);
  524. break;
  525. case '$':
  526. ret = regnode(EOL);
  527. break;
  528. case '.':
  529. ret = regnode(ANY);
  530. *flagp |= HASWIDTH | SIMPLE;
  531. break;
  532. case '[':{
  533. register int rxpclass;
  534. register int rxpclassend;
  535. if (*regparse == '^') { // Complement of range.
  536. ret = regnode(ANYBUT);
  537. regparse++;
  538. }
  539. else
  540. ret = regnode(ANYOF);
  541. if (*regparse == ']' || *regparse == '-')
  542. regc(*regparse++);
  543. while (*regparse != '\0' && *regparse != ']') {
  544. if (*regparse == '-') {
  545. regparse++;
  546. if (*regparse == ']' || *regparse == '\0')
  547. regc('-');
  548. else {
  549. rxpclass = UCHARAT(regparse - 2) + 1;
  550. rxpclassend = UCHARAT(regparse);
  551. if (rxpclass > rxpclassend + 1) {
  552. //RAISE Error, SYM(cmRegularExpression), SYM(Invalid_Range),
  553. printf ("cmRegularExpression::compile(): Invalid range in [].\n");
  554. return 0;
  555. }
  556. for (; rxpclass <= rxpclassend; rxpclass++)
  557. regc(rxpclass);
  558. regparse++;
  559. }
  560. }
  561. else
  562. regc(*regparse++);
  563. }
  564. regc('\0');
  565. if (*regparse != ']') {
  566. //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Bracket),
  567. printf ("cmRegularExpression::compile(): Unmatched [].\n");
  568. return 0;
  569. }
  570. regparse++;
  571. *flagp |= HASWIDTH | SIMPLE;
  572. }
  573. break;
  574. case '(':
  575. ret = reg(1, &flags);
  576. if (ret == NULL)
  577. return (NULL);
  578. *flagp |= flags & (HASWIDTH | SPSTART);
  579. break;
  580. case '\0':
  581. case '|':
  582. case ')':
  583. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  584. printf ("cmRegularExpression::compile(): Internal error.\n"); // Never here
  585. return 0;
  586. case '?':
  587. case '+':
  588. case '*':
  589. //RAISE Error, SYM(cmRegularExpression), SYM(No_Operand),
  590. printf ("cmRegularExpression::compile(): ?+* follows nothing.\n");
  591. return 0;
  592. case '\\':
  593. if (*regparse == '\0') {
  594. //RAISE Error, SYM(cmRegularExpression), SYM(Trailing_Backslash),
  595. printf ("cmRegularExpression::compile(): Trailing backslash.\n");
  596. return 0;
  597. }
  598. ret = regnode(EXACTLY);
  599. regc(*regparse++);
  600. regc('\0');
  601. *flagp |= HASWIDTH | SIMPLE;
  602. break;
  603. default:{
  604. register int len;
  605. register char ender;
  606. regparse--;
  607. len = strcspn(regparse, META);
  608. if (len <= 0) {
  609. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  610. printf ("cmRegularExpression::compile(): Internal error.\n");
  611. return 0;
  612. }
  613. ender = *(regparse + len);
  614. if (len > 1 && ISMULT(ender))
  615. len--; // Back off clear of ?+* operand.
  616. *flagp |= HASWIDTH;
  617. if (len == 1)
  618. *flagp |= SIMPLE;
  619. ret = regnode(EXACTLY);
  620. while (len > 0) {
  621. regc(*regparse++);
  622. len--;
  623. }
  624. regc('\0');
  625. }
  626. break;
  627. }
  628. return (ret);
  629. }
  630. /*
  631. - regnode - emit a node
  632. Location.
  633. */
  634. static char* regnode (char op) {
  635. register char* ret;
  636. register char* ptr;
  637. ret = regcode;
  638. if (ret == &regdummy) {
  639. regsize += 3;
  640. return (ret);
  641. }
  642. ptr = ret;
  643. *ptr++ = op;
  644. *ptr++ = '\0'; // Null "next" pointer.
  645. *ptr++ = '\0';
  646. regcode = ptr;
  647. return (ret);
  648. }
  649. /*
  650. - regc - emit (if appropriate) a byte of code
  651. */
  652. static void regc (unsigned char b) {
  653. if (regcode != &regdummy)
  654. *regcode++ = b;
  655. else
  656. regsize++;
  657. }
  658. /*
  659. - reginsert - insert an operator in front of already-emitted operand
  660. *
  661. * Means relocating the operand.
  662. */
  663. static void reginsert (char op, char* opnd) {
  664. register char* src;
  665. register char* dst;
  666. register char* place;
  667. if (regcode == &regdummy) {
  668. regsize += 3;
  669. return;
  670. }
  671. src = regcode;
  672. regcode += 3;
  673. dst = regcode;
  674. while (src > opnd)
  675. *--dst = *--src;
  676. place = opnd; // Op node, where operand used to be.
  677. *place++ = op;
  678. *place++ = '\0';
  679. *place++ = '\0';
  680. }
  681. /*
  682. - regtail - set the next-pointer at the end of a node chain
  683. */
  684. static void regtail (char* p, const char* val) {
  685. register char* scan;
  686. register char* temp;
  687. register int offset;
  688. if (p == &regdummy)
  689. return;
  690. // Find last node.
  691. scan = p;
  692. for (;;) {
  693. temp = regnext(scan);
  694. if (temp == NULL)
  695. break;
  696. scan = temp;
  697. }
  698. if (OP(scan) == BACK)
  699. offset = (const char*)scan - val;
  700. else
  701. offset = val - scan;
  702. *(scan + 1) = (offset >> 8) & 0377;
  703. *(scan + 2) = offset & 0377;
  704. }
  705. /*
  706. - regoptail - regtail on operand of first argument; nop if operandless
  707. */
  708. static void regoptail (char* p, const char* val) {
  709. // "Operandless" and "op != BRANCH" are synonymous in practice.
  710. if (p == NULL || p == &regdummy || OP(p) != BRANCH)
  711. return;
  712. regtail(OPERAND(p), val);
  713. }
  714. ////////////////////////////////////////////////////////////////////////
  715. //
  716. // find and friends
  717. //
  718. ////////////////////////////////////////////////////////////////////////
  719. /*
  720. * Global work variables for find().
  721. */
  722. static const char* reginput; // String-input pointer.
  723. static const char* regbol; // Beginning of input, for ^ check.
  724. static const char* *regstartp; // Pointer to startp array.
  725. static const char* *regendp; // Ditto for endp.
  726. /*
  727. * Forwards.
  728. */
  729. static int regtry (const char*, const char* *,
  730. const char* *, const char*);
  731. static int regmatch (const char*);
  732. static int regrepeat (const char*);
  733. #ifdef DEBUG
  734. int regnarrate = 0;
  735. void regdump ();
  736. static char* regprop ();
  737. #endif
  738. bool cmRegularExpression::find (std::string const& s)
  739. {
  740. return find(s.c_str());
  741. }
  742. // find -- Matches the regular expression to the given string.
  743. // Returns true if found, and sets start and end indexes accordingly.
  744. bool cmRegularExpression::find (const char* string) {
  745. register const char* s;
  746. this->searchstring = string;
  747. // Check validity of program.
  748. if (!this->program || UCHARAT(this->program) != MAGIC) {
  749. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  750. printf ("cmRegularExpression::find(): Compiled regular expression corrupted.\n");
  751. return 0;
  752. }
  753. // If there is a "must appear" string, look for it.
  754. if (this->regmust != NULL) {
  755. s = string;
  756. while ((s = strchr(s, this->regmust[0])) != NULL) {
  757. if (strncmp(s, this->regmust, this->regmlen) == 0)
  758. break; // Found it.
  759. s++;
  760. }
  761. if (s == NULL) // Not present.
  762. return (0);
  763. }
  764. // Mark beginning of line for ^ .
  765. regbol = string;
  766. // Simplest case: anchored match need be tried only once.
  767. if (this->reganch)
  768. return (regtry(string, this->startp, this->endp, this->program) != 0);
  769. // Messy cases: unanchored match.
  770. s = string;
  771. if (this->regstart != '\0')
  772. // We know what char it must start with.
  773. while ((s = strchr(s, this->regstart)) != NULL) {
  774. if (regtry(s, this->startp, this->endp, this->program))
  775. return (1);
  776. s++;
  777. }
  778. else
  779. // We don't -- general case.
  780. do {
  781. if (regtry(s, this->startp, this->endp, this->program))
  782. return (1);
  783. } while (*s++ != '\0');
  784. // Failure.
  785. return (0);
  786. }
  787. /*
  788. - regtry - try match at specific point
  789. 0 failure, 1 success
  790. */
  791. static int regtry (const char* string, const char* *start,
  792. const char* *end, const char* prog) {
  793. register int i;
  794. register const char* *sp1;
  795. register const char* *ep;
  796. reginput = string;
  797. regstartp = start;
  798. regendp = end;
  799. sp1 = start;
  800. ep = end;
  801. for (i = NSUBEXP; i > 0; i--) {
  802. *sp1++ = NULL;
  803. *ep++ = NULL;
  804. }
  805. if (regmatch(prog + 1)) {
  806. start[0] = string;
  807. end[0] = reginput;
  808. return (1);
  809. }
  810. else
  811. return (0);
  812. }
  813. /*
  814. - regmatch - main matching routine
  815. *
  816. * Conceptually the strategy is simple: check to see whether the current
  817. * node matches, call self recursively to see whether the rest matches,
  818. * and then act accordingly. In practice we make some effort to avoid
  819. * recursion, in particular by going through "ordinary" nodes (that don't
  820. * need to know whether the rest of the match failed) by a loop instead of
  821. * by recursion.
  822. * 0 failure, 1 success
  823. */
  824. static int regmatch (const char* prog) {
  825. register const char* scan; // Current node.
  826. const char* next; // Next node.
  827. scan = prog;
  828. while (scan != NULL) {
  829. next = regnext(scan);
  830. switch (OP(scan)) {
  831. case BOL:
  832. if (reginput != regbol)
  833. return (0);
  834. break;
  835. case EOL:
  836. if (*reginput != '\0')
  837. return (0);
  838. break;
  839. case ANY:
  840. if (*reginput == '\0')
  841. return (0);
  842. reginput++;
  843. break;
  844. case EXACTLY:{
  845. register int len;
  846. register const char* opnd;
  847. opnd = OPERAND(scan);
  848. // Inline the first character, for speed.
  849. if (*opnd != *reginput)
  850. return (0);
  851. len = strlen(opnd);
  852. if (len > 1 && strncmp(opnd, reginput, len) != 0)
  853. return (0);
  854. reginput += len;
  855. }
  856. break;
  857. case ANYOF:
  858. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
  859. return (0);
  860. reginput++;
  861. break;
  862. case ANYBUT:
  863. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
  864. return (0);
  865. reginput++;
  866. break;
  867. case NOTHING:
  868. break;
  869. case BACK:
  870. break;
  871. case OPEN + 1:
  872. case OPEN + 2:
  873. case OPEN + 3:
  874. case OPEN + 4:
  875. case OPEN + 5:
  876. case OPEN + 6:
  877. case OPEN + 7:
  878. case OPEN + 8:
  879. case OPEN + 9:{
  880. register int no;
  881. register const char* save;
  882. no = OP(scan) - OPEN;
  883. save = reginput;
  884. if (regmatch(next)) {
  885. //
  886. // Don't set startp if some later invocation of the
  887. // same parentheses already has.
  888. //
  889. if (regstartp[no] == NULL)
  890. regstartp[no] = save;
  891. return (1);
  892. }
  893. else
  894. return (0);
  895. }
  896. // break;
  897. case CLOSE + 1:
  898. case CLOSE + 2:
  899. case CLOSE + 3:
  900. case CLOSE + 4:
  901. case CLOSE + 5:
  902. case CLOSE + 6:
  903. case CLOSE + 7:
  904. case CLOSE + 8:
  905. case CLOSE + 9:{
  906. register int no;
  907. register const char* save;
  908. no = OP(scan) - CLOSE;
  909. save = reginput;
  910. if (regmatch(next)) {
  911. //
  912. // Don't set endp if some later invocation of the
  913. // same parentheses already has.
  914. //
  915. if (regendp[no] == NULL)
  916. regendp[no] = save;
  917. return (1);
  918. }
  919. else
  920. return (0);
  921. }
  922. // break;
  923. case BRANCH:{
  924. register const char* save;
  925. if (OP(next) != BRANCH) // No choice.
  926. next = OPERAND(scan); // Avoid recursion.
  927. else {
  928. do {
  929. save = reginput;
  930. if (regmatch(OPERAND(scan)))
  931. return (1);
  932. reginput = save;
  933. scan = regnext(scan);
  934. } while (scan != NULL && OP(scan) == BRANCH);
  935. return (0);
  936. // NOTREACHED
  937. }
  938. }
  939. break;
  940. case STAR:
  941. case PLUS:{
  942. register char nextch;
  943. register int no;
  944. register const char* save;
  945. register int min_no;
  946. //
  947. // Lookahead to avoid useless match attempts when we know
  948. // what character comes next.
  949. //
  950. nextch = '\0';
  951. if (OP(next) == EXACTLY)
  952. nextch = *OPERAND(next);
  953. min_no = (OP(scan) == STAR) ? 0 : 1;
  954. save = reginput;
  955. no = regrepeat(OPERAND(scan));
  956. while (no >= min_no) {
  957. // If it could work, try it.
  958. if (nextch == '\0' || *reginput == nextch)
  959. if (regmatch(next))
  960. return (1);
  961. // Couldn't or didn't -- back up.
  962. no--;
  963. reginput = save + no;
  964. }
  965. return (0);
  966. }
  967. // break;
  968. case END:
  969. return (1); // Success!
  970. default:
  971. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  972. printf ("cmRegularExpression::find(): Internal error -- memory corrupted.\n");
  973. return 0;
  974. }
  975. scan = next;
  976. }
  977. //
  978. // We get here only if there's trouble -- normally "case END" is the
  979. // terminating point.
  980. //
  981. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  982. printf ("cmRegularExpression::find(): Internal error -- corrupted pointers.\n");
  983. return (0);
  984. }
  985. /*
  986. - regrepeat - repeatedly match something simple, report how many
  987. */
  988. static int regrepeat (const char* p) {
  989. register int count = 0;
  990. register const char* scan;
  991. register const char* opnd;
  992. scan = reginput;
  993. opnd = OPERAND(p);
  994. switch (OP(p)) {
  995. case ANY:
  996. count = strlen(scan);
  997. scan += count;
  998. break;
  999. case EXACTLY:
  1000. while (*opnd == *scan) {
  1001. count++;
  1002. scan++;
  1003. }
  1004. break;
  1005. case ANYOF:
  1006. while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
  1007. count++;
  1008. scan++;
  1009. }
  1010. break;
  1011. case ANYBUT:
  1012. while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
  1013. count++;
  1014. scan++;
  1015. }
  1016. break;
  1017. default: // Oh dear. Called inappropriately.
  1018. //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
  1019. printf ("cm RegularExpression::find(): Internal error.\n");
  1020. return 0;
  1021. }
  1022. reginput = scan;
  1023. return (count);
  1024. }
  1025. /*
  1026. - regnext - dig the "next" pointer out of a node
  1027. */
  1028. static const char* regnext (register const char* p) {
  1029. register int offset;
  1030. if (p == &regdummy)
  1031. return (NULL);
  1032. offset = NEXT(p);
  1033. if (offset == 0)
  1034. return (NULL);
  1035. if (OP(p) == BACK)
  1036. return (p - offset);
  1037. else
  1038. return (p + offset);
  1039. }
  1040. static char* regnext (register char* p) {
  1041. register int offset;
  1042. if (p == &regdummy)
  1043. return (NULL);
  1044. offset = NEXT(p);
  1045. if (offset == 0)
  1046. return (NULL);
  1047. if (OP(p) == BACK)
  1048. return (p - offset);
  1049. else
  1050. return (p + offset);
  1051. }