RegularExpression.cxx 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209
  1. /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
  2. file Copyright.txt or https://cmake.org/licensing#kwsys for details. */
  3. //
  4. // Copyright (C) 1991 Texas Instruments Incorporated.
  5. //
  6. // Permission is granted to any individual or institution to use, copy, modify
  7. // and distribute this software, provided that this complete copyright and
  8. // permission notice is maintained, intact, in all copies and supporting
  9. // documentation.
  10. //
  11. // Texas Instruments Incorporated provides this software "as is" without
  12. // express or implied warranty.
  13. //
  14. //
  15. // Created: MNF 06/13/89 Initial Design and Implementation
  16. // Updated: LGO 08/09/89 Inherit from Generic
  17. // Updated: MBN 09/07/89 Added conditional exception handling
  18. // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
  19. // Updated: DLS 03/22/91 New lite version
  20. //
  21. #include "kwsysPrivate.h"
  22. #include KWSYS_HEADER(RegularExpression.hxx)
  23. // Work-around CMake dependency scanning limitation. This must
  24. // duplicate the above list of headers.
  25. #if 0
  26. #include "RegularExpression.hxx.in"
  27. #endif
  28. #include <stdio.h>
  29. #include <string.h>
  30. namespace KWSYS_NAMESPACE {
  31. // RegularExpression -- Copies the given regular expression.
  32. RegularExpression::RegularExpression(const RegularExpression& rxp)
  33. {
  34. if (!rxp.program) {
  35. this->program = 0;
  36. return;
  37. }
  38. int ind;
  39. this->progsize = rxp.progsize; // Copy regular expression size
  40. this->program = new char[this->progsize]; // Allocate storage
  41. for (ind = this->progsize; ind-- != 0;) // Copy regular expresion
  42. this->program[ind] = rxp.program[ind];
  43. this->startp[0] = rxp.startp[0]; // Copy pointers into last
  44. this->endp[0] = rxp.endp[0]; // Successful "find" operation
  45. this->regmust = rxp.regmust; // Copy field
  46. if (rxp.regmust != 0) {
  47. char* dum = rxp.program;
  48. ind = 0;
  49. while (dum != rxp.regmust) {
  50. ++dum;
  51. ++ind;
  52. }
  53. this->regmust = this->program + ind;
  54. }
  55. this->regstart = rxp.regstart; // Copy starting index
  56. this->reganch = rxp.reganch; // Copy remaining private data
  57. this->regmlen = rxp.regmlen; // Copy remaining private data
  58. }
  59. // operator= -- Copies the given regular expression.
  60. RegularExpression& RegularExpression::operator=(const RegularExpression& rxp)
  61. {
  62. if (this == &rxp) {
  63. return *this;
  64. }
  65. if (!rxp.program) {
  66. this->program = 0;
  67. return *this;
  68. }
  69. int ind;
  70. this->progsize = rxp.progsize; // Copy regular expression size
  71. delete[] this->program;
  72. this->program = new char[this->progsize]; // Allocate storage
  73. for (ind = this->progsize; ind-- != 0;) // Copy regular expresion
  74. this->program[ind] = rxp.program[ind];
  75. this->startp[0] = rxp.startp[0]; // Copy pointers into last
  76. this->endp[0] = rxp.endp[0]; // Successful "find" operation
  77. this->regmust = rxp.regmust; // Copy field
  78. if (rxp.regmust != 0) {
  79. char* dum = rxp.program;
  80. ind = 0;
  81. while (dum != rxp.regmust) {
  82. ++dum;
  83. ++ind;
  84. }
  85. this->regmust = this->program + ind;
  86. }
  87. this->regstart = rxp.regstart; // Copy starting index
  88. this->reganch = rxp.reganch; // Copy remaining private data
  89. this->regmlen = rxp.regmlen; // Copy remaining private data
  90. return *this;
  91. }
  92. // operator== -- Returns true if two regular expressions have the same
  93. // compiled program for pattern matching.
  94. bool RegularExpression::operator==(const RegularExpression& rxp) const
  95. {
  96. if (this != &rxp) { // Same address?
  97. int ind = this->progsize; // Get regular expression size
  98. if (ind != rxp.progsize) // If different size regexp
  99. return false; // Return failure
  100. while (ind-- != 0) // Else while still characters
  101. if (this->program[ind] != rxp.program[ind]) // If regexp are different
  102. return false; // Return failure
  103. }
  104. return true; // Else same, return success
  105. }
  106. // deep_equal -- Returns true if have the same compiled regular expressions
  107. // and the same start and end pointers.
  108. bool RegularExpression::deep_equal(const RegularExpression& rxp) const
  109. {
  110. int ind = this->progsize; // Get regular expression size
  111. if (ind != rxp.progsize) // If different size regexp
  112. return false; // Return failure
  113. while (ind-- != 0) // Else while still characters
  114. if (this->program[ind] != rxp.program[ind]) // If regexp are different
  115. return false; // Return failure
  116. return (this->startp[0] == rxp.startp[0] && // Else if same start/end ptrs,
  117. this->endp[0] == rxp.endp[0]); // Return true
  118. }
  119. // The remaining code in this file is derived from the regular expression code
  120. // whose copyright statement appears below. It has been changed to work
  121. // with the class concepts of C++ and COOL.
  122. /*
  123. * compile and find
  124. *
  125. * Copyright (c) 1986 by University of Toronto.
  126. * Written by Henry Spencer. Not derived from licensed software.
  127. *
  128. * Permission is granted to anyone to use this software for any
  129. * purpose on any computer system, and to redistribute it freely,
  130. * subject to the following restrictions:
  131. *
  132. * 1. The author is not responsible for the consequences of use of
  133. * this software, no matter how awful, even if they arise
  134. * from defects in it.
  135. *
  136. * 2. The origin of this software must not be misrepresented, either
  137. * by explicit claim or by omission.
  138. *
  139. * 3. Altered versions must be plainly marked as such, and must not
  140. * be misrepresented as being the original software.
  141. *
  142. * Beware that some of this code is subtly aware of the way operator
  143. * precedence is structured in regular expressions. Serious changes in
  144. * regular-expression syntax might require a total rethink.
  145. */
  146. /*
  147. * The "internal use only" fields in regexp.h are present to pass info from
  148. * compile to execute that permits the execute phase to run lots faster on
  149. * simple cases. They are:
  150. *
  151. * regstart char that must begin a match; '\0' if none obvious
  152. * reganch is the match anchored (at beginning-of-line only)?
  153. * regmust string (pointer into program) that match must include, or NULL
  154. * regmlen length of regmust string
  155. *
  156. * Regstart and reganch permit very fast decisions on suitable starting points
  157. * for a match, cutting down the work a lot. Regmust permits fast rejection
  158. * of lines that cannot possibly match. The regmust tests are costly enough
  159. * that compile() supplies a regmust only if the r.e. contains something
  160. * potentially expensive (at present, the only such thing detected is * or +
  161. * at the start of the r.e., which can involve a lot of backup). Regmlen is
  162. * supplied because the test in find() needs it and compile() is computing
  163. * it anyway.
  164. */
  165. /*
  166. * Structure for regexp "program". This is essentially a linear encoding
  167. * of a nondeterministic finite-state machine (aka syntax charts or
  168. * "railroad normal form" in parsing technology). Each node is an opcode
  169. * plus a "next" pointer, possibly plus an operand. "Next" pointers of
  170. * all nodes except BRANCH implement concatenation; a "next" pointer with
  171. * a BRANCH on both ends of it is connecting two alternatives. (Here we
  172. * have one of the subtle syntax dependencies: an individual BRANCH (as
  173. * opposed to a collection of them) is never concatenated with anything
  174. * because of operator precedence.) The operand of some types of node is
  175. * a literal string; for others, it is a node leading into a sub-FSM. In
  176. * particular, the operand of a BRANCH node is the first node of the branch.
  177. * (NB this is *not* a tree structure: the tail of the branch connects
  178. * to the thing following the set of BRANCHes.) The opcodes are:
  179. */
  180. // definition number opnd? meaning
  181. #define END 0 // no End of program.
  182. #define BOL 1 // no Match "" at beginning of line.
  183. #define EOL 2 // no Match "" at end of line.
  184. #define ANY 3 // no Match any one character.
  185. #define ANYOF 4 // str Match any character in this string.
  186. #define ANYBUT 5 // str Match any character not in this
  187. // string.
  188. #define BRANCH 6 // node Match this alternative, or the
  189. // next...
  190. #define BACK 7 // no Match "", "next" ptr points backward.
  191. #define EXACTLY 8 // str Match this string.
  192. #define NOTHING 9 // no Match empty string.
  193. #define STAR 10 // node Match this (simple) thing 0 or more
  194. // times.
  195. #define PLUS 11 // node Match this (simple) thing 1 or more
  196. // times.
  197. #define OPEN 20 // no Mark this point in input as start of
  198. // #n.
  199. // OPEN+1 is number 1, etc.
  200. #define CLOSE 30 // no Analogous to OPEN.
  201. /*
  202. * Opcode notes:
  203. *
  204. * BRANCH The set of branches constituting a single choice are hooked
  205. * together with their "next" pointers, since precedence prevents
  206. * anything being concatenated to any individual branch. The
  207. * "next" pointer of the last BRANCH in a choice points to the
  208. * thing following the whole choice. This is also where the
  209. * final "next" pointer of each individual branch points; each
  210. * branch starts with the operand node of a BRANCH node.
  211. *
  212. * BACK Normal "next" pointers all implicitly point forward; BACK
  213. * exists to make loop structures possible.
  214. *
  215. * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
  216. * BRANCH structures using BACK. Simple cases (one character
  217. * per match) are implemented with STAR and PLUS for speed
  218. * and to minimize recursive plunges.
  219. *
  220. * OPEN,CLOSE ...are numbered at compile time.
  221. */
  222. /*
  223. * A node is one char of opcode followed by two chars of "next" pointer.
  224. * "Next" pointers are stored as two 8-bit pieces, high order first. The
  225. * value is a positive offset from the opcode of the node containing it.
  226. * An operand, if any, simply follows the node. (Note that much of the
  227. * code generation knows about this implicit relationship.)
  228. *
  229. * Using two bytes for the "next" pointer is vast overkill for most things,
  230. * but allows patterns to get big without disasters.
  231. */
  232. #define OP(p) (*(p))
  233. #define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
  234. #define OPERAND(p) ((p) + 3)
  235. const unsigned char MAGIC = 0234;
  236. /*
  237. * Utility definitions.
  238. */
  239. #define UCHARAT(p) (reinterpret_cast<const unsigned char*>(p))[0]
  240. #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
  241. #define META "^$.[()|?+*\\"
  242. /*
  243. * Flags to be passed up and down.
  244. */
  245. #define HASWIDTH 01 // Known never to match null string.
  246. #define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
  247. #define SPSTART 04 // Starts with * or +.
  248. #define WORST 0 // Worst case.
  249. /////////////////////////////////////////////////////////////////////////
  250. //
  251. // COMPILE AND ASSOCIATED FUNCTIONS
  252. //
  253. /////////////////////////////////////////////////////////////////////////
  254. /*
  255. * Global work variables for compile().
  256. */
  257. static const char* regparse; // Input-scan pointer.
  258. static int regnpar; // () count.
  259. static char regdummy;
  260. static char* regcode; // Code-emit pointer; &regdummy = don't.
  261. static long regsize; // Code size.
  262. /*
  263. * Forward declarations for compile()'s friends.
  264. */
  265. // #ifndef static
  266. // #define static static
  267. // #endif
  268. static char* reg(int, int*);
  269. static char* regbranch(int*);
  270. static char* regpiece(int*);
  271. static char* regatom(int*);
  272. static char* regnode(char);
  273. static const char* regnext(const char*);
  274. static char* regnext(char*);
  275. static void regc(char);
  276. static void reginsert(char, char*);
  277. static void regtail(char*, const char*);
  278. static void regoptail(char*, const char*);
  279. #ifdef STRCSPN
  280. static int strcspn();
  281. #endif
  282. /*
  283. * We can't allocate space until we know how big the compiled form will be,
  284. * but we can't compile it (and thus know how big it is) until we've got a
  285. * place to put the code. So we cheat: we compile it twice, once with code
  286. * generation turned off and size counting turned on, and once "for real".
  287. * This also means that we don't allocate space until we are sure that the
  288. * thing really will compile successfully, and we never have to move the
  289. * code and thus invalidate pointers into it. (Note that it has to be in
  290. * one piece because free() must be able to free it all.)
  291. *
  292. * Beware that the optimization-preparation code in here knows about some
  293. * of the structure of the compiled regexp.
  294. */
  295. // compile -- compile a regular expression into internal code
  296. // for later pattern matching.
  297. bool RegularExpression::compile(const char* exp)
  298. {
  299. const char* scan;
  300. const char* longest;
  301. size_t len;
  302. int flags;
  303. if (exp == 0) {
  304. // RAISE Error, SYM(RegularExpression), SYM(No_Expr),
  305. printf("RegularExpression::compile(): No expression supplied.\n");
  306. return false;
  307. }
  308. // First pass: determine size, legality.
  309. regparse = exp;
  310. regnpar = 1;
  311. regsize = 0L;
  312. regcode = &regdummy;
  313. regc(static_cast<char>(MAGIC));
  314. if (!reg(0, &flags)) {
  315. printf("RegularExpression::compile(): Error in compile.\n");
  316. return false;
  317. }
  318. this->startp[0] = this->endp[0] = this->searchstring = 0;
  319. // Small enough for pointer-storage convention?
  320. if (regsize >= 32767L) { // Probably could be 65535L.
  321. // RAISE Error, SYM(RegularExpression), SYM(Expr_Too_Big),
  322. printf("RegularExpression::compile(): Expression too big.\n");
  323. return false;
  324. }
  325. // Allocate space.
  326. //#ifndef _WIN32
  327. if (this->program != 0)
  328. delete[] this->program;
  329. //#endif
  330. this->program = new char[regsize];
  331. this->progsize = static_cast<int>(regsize);
  332. if (this->program == 0) {
  333. // RAISE Error, SYM(RegularExpression), SYM(Out_Of_Memory),
  334. printf("RegularExpression::compile(): Out of memory.\n");
  335. return false;
  336. }
  337. // Second pass: emit code.
  338. regparse = exp;
  339. regnpar = 1;
  340. regcode = this->program;
  341. regc(static_cast<char>(MAGIC));
  342. reg(0, &flags);
  343. // Dig out information for optimizations.
  344. this->regstart = '\0'; // Worst-case defaults.
  345. this->reganch = 0;
  346. this->regmust = 0;
  347. this->regmlen = 0;
  348. scan = this->program + 1; // First BRANCH.
  349. if (OP(regnext(scan)) == END) { // Only one top-level choice.
  350. scan = OPERAND(scan);
  351. // Starting-point info.
  352. if (OP(scan) == EXACTLY)
  353. this->regstart = *OPERAND(scan);
  354. else if (OP(scan) == BOL)
  355. this->reganch++;
  356. //
  357. // If there's something expensive in the r.e., find the longest
  358. // literal string that must appear and make it the regmust. Resolve
  359. // ties in favor of later strings, since the regstart check works
  360. // with the beginning of the r.e. and avoiding duplication
  361. // strengthens checking. Not a strong reason, but sufficient in the
  362. // absence of others.
  363. //
  364. if (flags & SPSTART) {
  365. longest = 0;
  366. len = 0;
  367. for (; scan != 0; scan = regnext(scan))
  368. if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
  369. longest = OPERAND(scan);
  370. len = strlen(OPERAND(scan));
  371. }
  372. this->regmust = longest;
  373. this->regmlen = len;
  374. }
  375. }
  376. return true;
  377. }
  378. /*
  379. - reg - regular expression, i.e. main body or parenthesized thing
  380. *
  381. * Caller must absorb opening parenthesis.
  382. *
  383. * Combining parenthesis handling with the base level of regular expression
  384. * is a trifle forced, but the need to tie the tails of the branches to what
  385. * follows makes it hard to avoid.
  386. */
  387. static char* reg(int paren, int* flagp)
  388. {
  389. char* ret;
  390. char* br;
  391. char* ender;
  392. int parno = 0;
  393. int flags;
  394. *flagp = HASWIDTH; // Tentatively.
  395. // Make an OPEN node, if parenthesized.
  396. if (paren) {
  397. if (regnpar >= RegularExpression::NSUBEXP) {
  398. // RAISE Error, SYM(RegularExpression), SYM(Too_Many_Parens),
  399. printf("RegularExpression::compile(): Too many parentheses.\n");
  400. return 0;
  401. }
  402. parno = regnpar;
  403. regnpar++;
  404. ret = regnode(static_cast<char>(OPEN + parno));
  405. } else
  406. ret = 0;
  407. // Pick up the branches, linking them together.
  408. br = regbranch(&flags);
  409. if (br == 0)
  410. return (0);
  411. if (ret != 0)
  412. regtail(ret, br); // OPEN -> first.
  413. else
  414. ret = br;
  415. if (!(flags & HASWIDTH))
  416. *flagp &= ~HASWIDTH;
  417. *flagp |= flags & SPSTART;
  418. while (*regparse == '|') {
  419. regparse++;
  420. br = regbranch(&flags);
  421. if (br == 0)
  422. return (0);
  423. regtail(ret, br); // BRANCH -> BRANCH.
  424. if (!(flags & HASWIDTH))
  425. *flagp &= ~HASWIDTH;
  426. *flagp |= flags & SPSTART;
  427. }
  428. // Make a closing node, and hook it on the end.
  429. ender = regnode(static_cast<char>((paren) ? CLOSE + parno : END));
  430. regtail(ret, ender);
  431. // Hook the tails of the branches to the closing node.
  432. for (br = ret; br != 0; br = regnext(br))
  433. regoptail(br, ender);
  434. // Check for proper termination.
  435. if (paren && *regparse++ != ')') {
  436. // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
  437. printf("RegularExpression::compile(): Unmatched parentheses.\n");
  438. return 0;
  439. } else if (!paren && *regparse != '\0') {
  440. if (*regparse == ')') {
  441. // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
  442. printf("RegularExpression::compile(): Unmatched parentheses.\n");
  443. return 0;
  444. } else {
  445. // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
  446. printf("RegularExpression::compile(): Internal error.\n");
  447. return 0;
  448. }
  449. // NOTREACHED
  450. }
  451. return (ret);
  452. }
  453. /*
  454. - regbranch - one alternative of an | operator
  455. *
  456. * Implements the concatenation operator.
  457. */
  458. static char* regbranch(int* flagp)
  459. {
  460. char* ret;
  461. char* chain;
  462. char* latest;
  463. int flags;
  464. *flagp = WORST; // Tentatively.
  465. ret = regnode(BRANCH);
  466. chain = 0;
  467. while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
  468. latest = regpiece(&flags);
  469. if (latest == 0)
  470. return (0);
  471. *flagp |= flags & HASWIDTH;
  472. if (chain == 0) // First piece.
  473. *flagp |= flags & SPSTART;
  474. else
  475. regtail(chain, latest);
  476. chain = latest;
  477. }
  478. if (chain == 0) // Loop ran zero times.
  479. regnode(NOTHING);
  480. return (ret);
  481. }
  482. /*
  483. - regpiece - something followed by possible [*+?]
  484. *
  485. * Note that the branching code sequences used for ? and the general cases
  486. * of * and + are somewhat optimized: they use the same NOTHING node as
  487. * both the endmarker for their branch list and the body of the last branch.
  488. * It might seem that this node could be dispensed with entirely, but the
  489. * endmarker role is not redundant.
  490. */
  491. static char* regpiece(int* flagp)
  492. {
  493. char* ret;
  494. char op;
  495. char* next;
  496. int flags;
  497. ret = regatom(&flags);
  498. if (ret == 0)
  499. return (0);
  500. op = *regparse;
  501. if (!ISMULT(op)) {
  502. *flagp = flags;
  503. return (ret);
  504. }
  505. if (!(flags & HASWIDTH) && op != '?') {
  506. // RAISE Error, SYM(RegularExpression), SYM(Empty_Operand),
  507. printf("RegularExpression::compile() : *+ operand could be empty.\n");
  508. return 0;
  509. }
  510. *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
  511. if (op == '*' && (flags & SIMPLE))
  512. reginsert(STAR, ret);
  513. else if (op == '*') {
  514. // Emit x* as (x&|), where & means "self".
  515. reginsert(BRANCH, ret); // Either x
  516. regoptail(ret, regnode(BACK)); // and loop
  517. regoptail(ret, ret); // back
  518. regtail(ret, regnode(BRANCH)); // or
  519. regtail(ret, regnode(NOTHING)); // null.
  520. } else if (op == '+' && (flags & SIMPLE))
  521. reginsert(PLUS, ret);
  522. else if (op == '+') {
  523. // Emit x+ as x(&|), where & means "self".
  524. next = regnode(BRANCH); // Either
  525. regtail(ret, next);
  526. regtail(regnode(BACK), ret); // loop back
  527. regtail(next, regnode(BRANCH)); // or
  528. regtail(ret, regnode(NOTHING)); // null.
  529. } else if (op == '?') {
  530. // Emit x? as (x|)
  531. reginsert(BRANCH, ret); // Either x
  532. regtail(ret, regnode(BRANCH)); // or
  533. next = regnode(NOTHING); // null.
  534. regtail(ret, next);
  535. regoptail(ret, next);
  536. }
  537. regparse++;
  538. if (ISMULT(*regparse)) {
  539. // RAISE Error, SYM(RegularExpression), SYM(Nested_Operand),
  540. printf("RegularExpression::compile(): Nested *?+.\n");
  541. return 0;
  542. }
  543. return (ret);
  544. }
  545. /*
  546. - regatom - the lowest level
  547. *
  548. * Optimization: gobbles an entire sequence of ordinary characters so that
  549. * it can turn them into a single node, which is smaller to store and
  550. * faster to run. Backslashed characters are exceptions, each becoming a
  551. * separate node; the code is simpler that way and it's not worth fixing.
  552. */
  553. static char* regatom(int* flagp)
  554. {
  555. char* ret;
  556. int flags;
  557. *flagp = WORST; // Tentatively.
  558. switch (*regparse++) {
  559. case '^':
  560. ret = regnode(BOL);
  561. break;
  562. case '$':
  563. ret = regnode(EOL);
  564. break;
  565. case '.':
  566. ret = regnode(ANY);
  567. *flagp |= HASWIDTH | SIMPLE;
  568. break;
  569. case '[': {
  570. int rxpclass;
  571. int rxpclassend;
  572. if (*regparse == '^') { // Complement of range.
  573. ret = regnode(ANYBUT);
  574. regparse++;
  575. } else
  576. ret = regnode(ANYOF);
  577. if (*regparse == ']' || *regparse == '-')
  578. regc(*regparse++);
  579. while (*regparse != '\0' && *regparse != ']') {
  580. if (*regparse == '-') {
  581. regparse++;
  582. if (*regparse == ']' || *regparse == '\0')
  583. regc('-');
  584. else {
  585. rxpclass = UCHARAT(regparse - 2) + 1;
  586. rxpclassend = UCHARAT(regparse);
  587. if (rxpclass > rxpclassend + 1) {
  588. // RAISE Error, SYM(RegularExpression), SYM(Invalid_Range),
  589. printf("RegularExpression::compile(): Invalid range in [].\n");
  590. return 0;
  591. }
  592. for (; rxpclass <= rxpclassend; rxpclass++)
  593. regc(static_cast<char>(rxpclass));
  594. regparse++;
  595. }
  596. } else
  597. regc(*regparse++);
  598. }
  599. regc('\0');
  600. if (*regparse != ']') {
  601. // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Bracket),
  602. printf("RegularExpression::compile(): Unmatched [].\n");
  603. return 0;
  604. }
  605. regparse++;
  606. *flagp |= HASWIDTH | SIMPLE;
  607. } break;
  608. case '(':
  609. ret = reg(1, &flags);
  610. if (ret == 0)
  611. return (0);
  612. *flagp |= flags & (HASWIDTH | SPSTART);
  613. break;
  614. case '\0':
  615. case '|':
  616. case ')':
  617. // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
  618. printf("RegularExpression::compile(): Internal error.\n"); // Never here
  619. return 0;
  620. case '?':
  621. case '+':
  622. case '*':
  623. // RAISE Error, SYM(RegularExpression), SYM(No_Operand),
  624. printf("RegularExpression::compile(): ?+* follows nothing.\n");
  625. return 0;
  626. case '\\':
  627. if (*regparse == '\0') {
  628. // RAISE Error, SYM(RegularExpression), SYM(Trailing_Backslash),
  629. printf("RegularExpression::compile(): Trailing backslash.\n");
  630. return 0;
  631. }
  632. ret = regnode(EXACTLY);
  633. regc(*regparse++);
  634. regc('\0');
  635. *flagp |= HASWIDTH | SIMPLE;
  636. break;
  637. default: {
  638. int len;
  639. char ender;
  640. regparse--;
  641. len = int(strcspn(regparse, META));
  642. if (len <= 0) {
  643. // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
  644. printf("RegularExpression::compile(): Internal error.\n");
  645. return 0;
  646. }
  647. ender = *(regparse + len);
  648. if (len > 1 && ISMULT(ender))
  649. len--; // Back off clear of ?+* operand.
  650. *flagp |= HASWIDTH;
  651. if (len == 1)
  652. *flagp |= SIMPLE;
  653. ret = regnode(EXACTLY);
  654. while (len > 0) {
  655. regc(*regparse++);
  656. len--;
  657. }
  658. regc('\0');
  659. } break;
  660. }
  661. return (ret);
  662. }
  663. /*
  664. - regnode - emit a node
  665. Location.
  666. */
  667. static char* regnode(char op)
  668. {
  669. char* ret;
  670. char* ptr;
  671. ret = regcode;
  672. if (ret == &regdummy) {
  673. regsize += 3;
  674. return (ret);
  675. }
  676. ptr = ret;
  677. *ptr++ = op;
  678. *ptr++ = '\0'; // Null "next" pointer.
  679. *ptr++ = '\0';
  680. regcode = ptr;
  681. return (ret);
  682. }
  683. /*
  684. - regc - emit (if appropriate) a byte of code
  685. */
  686. static void regc(char b)
  687. {
  688. if (regcode != &regdummy)
  689. *regcode++ = b;
  690. else
  691. regsize++;
  692. }
  693. /*
  694. - reginsert - insert an operator in front of already-emitted operand
  695. *
  696. * Means relocating the operand.
  697. */
  698. static void reginsert(char op, char* opnd)
  699. {
  700. char* src;
  701. char* dst;
  702. char* place;
  703. if (regcode == &regdummy) {
  704. regsize += 3;
  705. return;
  706. }
  707. src = regcode;
  708. regcode += 3;
  709. dst = regcode;
  710. while (src > opnd)
  711. *--dst = *--src;
  712. place = opnd; // Op node, where operand used to be.
  713. *place++ = op;
  714. *place++ = '\0';
  715. *place = '\0';
  716. }
  717. /*
  718. - regtail - set the next-pointer at the end of a node chain
  719. */
  720. static void regtail(char* p, const char* val)
  721. {
  722. char* scan;
  723. char* temp;
  724. int offset;
  725. if (p == &regdummy)
  726. return;
  727. // Find last node.
  728. scan = p;
  729. for (;;) {
  730. temp = regnext(scan);
  731. if (temp == 0)
  732. break;
  733. scan = temp;
  734. }
  735. if (OP(scan) == BACK)
  736. offset = int(scan - val);
  737. else
  738. offset = int(val - scan);
  739. *(scan + 1) = static_cast<char>((offset >> 8) & 0377);
  740. *(scan + 2) = static_cast<char>(offset & 0377);
  741. }
  742. /*
  743. - regoptail - regtail on operand of first argument; nop if operandless
  744. */
  745. static void regoptail(char* p, const char* val)
  746. {
  747. // "Operandless" and "op != BRANCH" are synonymous in practice.
  748. if (p == 0 || p == &regdummy || OP(p) != BRANCH)
  749. return;
  750. regtail(OPERAND(p), val);
  751. }
  752. ////////////////////////////////////////////////////////////////////////
  753. //
  754. // find and friends
  755. //
  756. ////////////////////////////////////////////////////////////////////////
  757. /*
  758. * Global work variables for find().
  759. */
  760. static const char* reginput; // String-input pointer.
  761. static const char* regbol; // Beginning of input, for ^ check.
  762. static const char** regstartp; // Pointer to startp array.
  763. static const char** regendp; // Ditto for endp.
  764. /*
  765. * Forwards.
  766. */
  767. static int regtry(const char*, const char**, const char**, const char*);
  768. static int regmatch(const char*);
  769. static int regrepeat(const char*);
  770. #ifdef DEBUG
  771. int regnarrate = 0;
  772. void regdump();
  773. static char* regprop();
  774. #endif
  775. // find -- Matches the regular expression to the given string.
  776. // Returns true if found, and sets start and end indexes accordingly.
  777. bool RegularExpression::find(const char* string)
  778. {
  779. const char* s;
  780. this->searchstring = string;
  781. if (!this->program) {
  782. return false;
  783. }
  784. // Check validity of program.
  785. if (UCHARAT(this->program) != MAGIC) {
  786. // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
  787. printf(
  788. "RegularExpression::find(): Compiled regular expression corrupted.\n");
  789. return 0;
  790. }
  791. // If there is a "must appear" string, look for it.
  792. if (this->regmust != 0) {
  793. s = string;
  794. while ((s = strchr(s, this->regmust[0])) != 0) {
  795. if (strncmp(s, this->regmust, this->regmlen) == 0)
  796. break; // Found it.
  797. s++;
  798. }
  799. if (s == 0) // Not present.
  800. return (0);
  801. }
  802. // Mark beginning of line for ^ .
  803. regbol = string;
  804. // Simplest case: anchored match need be tried only once.
  805. if (this->reganch)
  806. return (regtry(string, this->startp, this->endp, this->program) != 0);
  807. // Messy cases: unanchored match.
  808. s = string;
  809. if (this->regstart != '\0')
  810. // We know what char it must start with.
  811. while ((s = strchr(s, this->regstart)) != 0) {
  812. if (regtry(s, this->startp, this->endp, this->program))
  813. return (1);
  814. s++;
  815. }
  816. else
  817. // We don't -- general case.
  818. do {
  819. if (regtry(s, this->startp, this->endp, this->program))
  820. return (1);
  821. } while (*s++ != '\0');
  822. // Failure.
  823. return (0);
  824. }
  825. /*
  826. - regtry - try match at specific point
  827. 0 failure, 1 success
  828. */
  829. static int regtry(const char* string, const char** start, const char** end,
  830. const char* prog)
  831. {
  832. int i;
  833. const char** sp1;
  834. const char** ep;
  835. reginput = string;
  836. regstartp = start;
  837. regendp = end;
  838. sp1 = start;
  839. ep = end;
  840. for (i = RegularExpression::NSUBEXP; i > 0; i--) {
  841. *sp1++ = 0;
  842. *ep++ = 0;
  843. }
  844. if (regmatch(prog + 1)) {
  845. start[0] = string;
  846. end[0] = reginput;
  847. return (1);
  848. } else
  849. return (0);
  850. }
  851. /*
  852. - regmatch - main matching routine
  853. *
  854. * Conceptually the strategy is simple: check to see whether the current
  855. * node matches, call self recursively to see whether the rest matches,
  856. * and then act accordingly. In practice we make some effort to avoid
  857. * recursion, in particular by going through "ordinary" nodes (that don't
  858. * need to know whether the rest of the match failed) by a loop instead of
  859. * by recursion.
  860. * 0 failure, 1 success
  861. */
  862. static int regmatch(const char* prog)
  863. {
  864. const char* scan; // Current node.
  865. const char* next; // Next node.
  866. scan = prog;
  867. while (scan != 0) {
  868. next = regnext(scan);
  869. switch (OP(scan)) {
  870. case BOL:
  871. if (reginput != regbol)
  872. return (0);
  873. break;
  874. case EOL:
  875. if (*reginput != '\0')
  876. return (0);
  877. break;
  878. case ANY:
  879. if (*reginput == '\0')
  880. return (0);
  881. reginput++;
  882. break;
  883. case EXACTLY: {
  884. size_t len;
  885. const char* opnd;
  886. opnd = OPERAND(scan);
  887. // Inline the first character, for speed.
  888. if (*opnd != *reginput)
  889. return (0);
  890. len = strlen(opnd);
  891. if (len > 1 && strncmp(opnd, reginput, len) != 0)
  892. return (0);
  893. reginput += len;
  894. } break;
  895. case ANYOF:
  896. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == 0)
  897. return (0);
  898. reginput++;
  899. break;
  900. case ANYBUT:
  901. if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != 0)
  902. return (0);
  903. reginput++;
  904. break;
  905. case NOTHING:
  906. break;
  907. case BACK:
  908. break;
  909. case OPEN + 1:
  910. case OPEN + 2:
  911. case OPEN + 3:
  912. case OPEN + 4:
  913. case OPEN + 5:
  914. case OPEN + 6:
  915. case OPEN + 7:
  916. case OPEN + 8:
  917. case OPEN + 9: {
  918. int no;
  919. const char* save;
  920. no = OP(scan) - OPEN;
  921. save = reginput;
  922. if (regmatch(next)) {
  923. //
  924. // Don't set startp if some later invocation of the
  925. // same parentheses already has.
  926. //
  927. if (regstartp[no] == 0)
  928. regstartp[no] = save;
  929. return (1);
  930. } else
  931. return (0);
  932. }
  933. // break;
  934. case CLOSE + 1:
  935. case CLOSE + 2:
  936. case CLOSE + 3:
  937. case CLOSE + 4:
  938. case CLOSE + 5:
  939. case CLOSE + 6:
  940. case CLOSE + 7:
  941. case CLOSE + 8:
  942. case CLOSE + 9: {
  943. int no;
  944. const char* save;
  945. no = OP(scan) - CLOSE;
  946. save = reginput;
  947. if (regmatch(next)) {
  948. //
  949. // Don't set endp if some later invocation of the
  950. // same parentheses already has.
  951. //
  952. if (regendp[no] == 0)
  953. regendp[no] = save;
  954. return (1);
  955. } else
  956. return (0);
  957. }
  958. // break;
  959. case BRANCH: {
  960. const char* save;
  961. if (OP(next) != BRANCH) // No choice.
  962. next = OPERAND(scan); // Avoid recursion.
  963. else {
  964. do {
  965. save = reginput;
  966. if (regmatch(OPERAND(scan)))
  967. return (1);
  968. reginput = save;
  969. scan = regnext(scan);
  970. } while (scan != 0 && OP(scan) == BRANCH);
  971. return (0);
  972. // NOTREACHED
  973. }
  974. } break;
  975. case STAR:
  976. case PLUS: {
  977. char nextch;
  978. int no;
  979. const char* save;
  980. int min_no;
  981. //
  982. // Lookahead to avoid useless match attempts when we know
  983. // what character comes next.
  984. //
  985. nextch = '\0';
  986. if (OP(next) == EXACTLY)
  987. nextch = *OPERAND(next);
  988. min_no = (OP(scan) == STAR) ? 0 : 1;
  989. save = reginput;
  990. no = regrepeat(OPERAND(scan));
  991. while (no >= min_no) {
  992. // If it could work, try it.
  993. if (nextch == '\0' || *reginput == nextch)
  994. if (regmatch(next))
  995. return (1);
  996. // Couldn't or didn't -- back up.
  997. no--;
  998. reginput = save + no;
  999. }
  1000. return (0);
  1001. }
  1002. // break;
  1003. case END:
  1004. return (1); // Success!
  1005. default:
  1006. // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
  1007. printf(
  1008. "RegularExpression::find(): Internal error -- memory corrupted.\n");
  1009. return 0;
  1010. }
  1011. scan = next;
  1012. }
  1013. //
  1014. // We get here only if there's trouble -- normally "case END" is the
  1015. // terminating point.
  1016. //
  1017. // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
  1018. printf("RegularExpression::find(): Internal error -- corrupted pointers.\n");
  1019. return (0);
  1020. }
  1021. /*
  1022. - regrepeat - repeatedly match something simple, report how many
  1023. */
  1024. static int regrepeat(const char* p)
  1025. {
  1026. int count = 0;
  1027. const char* scan;
  1028. const char* opnd;
  1029. scan = reginput;
  1030. opnd = OPERAND(p);
  1031. switch (OP(p)) {
  1032. case ANY:
  1033. count = int(strlen(scan));
  1034. scan += count;
  1035. break;
  1036. case EXACTLY:
  1037. while (*opnd == *scan) {
  1038. count++;
  1039. scan++;
  1040. }
  1041. break;
  1042. case ANYOF:
  1043. while (*scan != '\0' && strchr(opnd, *scan) != 0) {
  1044. count++;
  1045. scan++;
  1046. }
  1047. break;
  1048. case ANYBUT:
  1049. while (*scan != '\0' && strchr(opnd, *scan) == 0) {
  1050. count++;
  1051. scan++;
  1052. }
  1053. break;
  1054. default: // Oh dear. Called inappropriately.
  1055. // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
  1056. printf("cm RegularExpression::find(): Internal error.\n");
  1057. return 0;
  1058. }
  1059. reginput = scan;
  1060. return (count);
  1061. }
  1062. /*
  1063. - regnext - dig the "next" pointer out of a node
  1064. */
  1065. static const char* regnext(const char* p)
  1066. {
  1067. int offset;
  1068. if (p == &regdummy)
  1069. return (0);
  1070. offset = NEXT(p);
  1071. if (offset == 0)
  1072. return (0);
  1073. if (OP(p) == BACK)
  1074. return (p - offset);
  1075. else
  1076. return (p + offset);
  1077. }
  1078. static char* regnext(char* p)
  1079. {
  1080. int offset;
  1081. if (p == &regdummy)
  1082. return (0);
  1083. offset = NEXT(p);
  1084. if (offset == 0)
  1085. return (0);
  1086. if (OP(p) == BACK)
  1087. return (p - offset);
  1088. else
  1089. return (p + offset);
  1090. }
  1091. } // namespace KWSYS_NAMESPACE