regex.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160
  1. /** BEGIN COPYRIGHT BLOCK
  2. * This Program is free software; you can redistribute it and/or modify it under
  3. * the terms of the GNU General Public License as published by the Free Software
  4. * Foundation; version 2 of the License.
  5. *
  6. * This Program is distributed in the hope that it will be useful, but WITHOUT
  7. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  8. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  9. *
  10. * You should have received a copy of the GNU General Public License along with
  11. * this Program; if not, write to the Free Software Foundation, Inc., 59 Temple
  12. * Place, Suite 330, Boston, MA 02111-1307 USA.
  13. *
  14. * In addition, as a special exception, Red Hat, Inc. gives You the additional
  15. * right to link the code of this Program with code not covered under the GNU
  16. * General Public License ("Non-GPL Code") and to distribute linked combinations
  17. * including the two, subject to the limitations in this paragraph. Non-GPL Code
  18. * permitted under this exception must only link to the code of this Program
  19. * through those well defined interfaces identified in the file named EXCEPTION
  20. * found in the source code files (the "Approved Interfaces"). The files of
  21. * Non-GPL Code may instantiate templates or use macros or inline functions from
  22. * the Approved Interfaces without causing the resulting work to be covered by
  23. * the GNU General Public License. Only Red Hat, Inc. may make changes or
  24. * additions to the list of Approved Interfaces. You must obey the GNU General
  25. * Public License in all respects for all of the Program code and other code used
  26. * in conjunction with the Program except the Non-GPL Code covered by this
  27. * exception. If you modify this file, you may extend this exception to your
  28. * version of the file, but you are not obligated to do so. If you do not wish to
  29. * provide this exception without modification, you must delete this exception
  30. * statement from your version and license this file solely under the GPL without
  31. * exception.
  32. *
  33. *
  34. * Copyright (C) 2001 Sun Microsystems, Inc. Used by permission.
  35. * Copyright (C) 2005 Red Hat, Inc.
  36. * All rights reserved.
  37. * END COPYRIGHT BLOCK **/
  38. #ifdef HAVE_CONFIG_H
  39. # include <config.h>
  40. #endif
  41. #include "slap.h" /* must come before regex.h */
  42. #include "portable.h"
  43. #if defined( MACOS ) || defined( DOS ) || defined( _WIN32 ) || defined( NEED_BSDREGEX )
  44. #include "regex.h"
  45. /*
  46. * regex - Regular expression pattern matching and replacement
  47. *
  48. * By: Ozan S. Yigit (oz)
  49. * Dept. of Computer Science
  50. * York University
  51. *
  52. * These routines are the PUBLIC DOMAIN equivalents of regex
  53. * routines as found in 4.nBSD UN*X, with minor extensions.
  54. *
  55. * These routines are derived from various implementations found
  56. * in software tools books, and Conroy's grep. They are NOT derived
  57. * from licensed/restricted software.
  58. * For more interesting/academic/complicated implementations,
  59. * see Henry Spencer's regexp routines, or GNU Emacs pattern
  60. * matching module.
  61. *
  62. * Modification history:
  63. *
  64. * $Log: regex.c,v $
  65. * Revision 1.8 2009/01/12 18:14:57 nkinder
  66. * Resolves: 174394
  67. * Summary: Make regex filter code handle empty values properly.
  68. *
  69. * Revision 1.7 2008/06/30 17:28:16 nhosoi
  70. * Resoves: #448831
  71. * Summary: attacker can tie up CPU in regex code
  72. * Description: when substring search is requested, sets the time limit based upon
  73. * the nsslapd-timelimit value. Pass the timelimit (time_up) to the regular
  74. * expression function. When the time is up, it returns the "Timelimit exceeded"
  75. * error. Note: timelimit is applied non-Directory Manager users.
  76. *
  77. * Revision 1.6 2008/04/29 00:38:36 nhosoi
  78. * Resolves: #182621 (#443955)
  79. * Summary: Allow larger regex buffer to enable long substring filters
  80. * Description: Applying the patches provided by [email protected].
  81. * regex.c: use dynamically allocated regex buffer, use ptrdiff_t to store the offsets to be restored after the realloc, and use a constant for the value of "how much the NFA buffer can grow in one iteration on the pattern".
  82. * string.c: use dynamically allocated buffer if the prepared buffer is not large enough, used wrong pointer (pat instead of p) in a debug message, and performed an unneeded strcat of ".*"
  83. *
  84. * Revision 1.5 2006/11/10 23:45:40 nhosoi
  85. * Resolves: #214533
  86. * Summary: configure needs to support --with-fhs (Comment #6)
  87. * Changes: Added the following include next to the end of the copyright block.
  88. * +
  89. * +#ifdef HAVE_CONFIG_H
  90. * +# include <config.h>
  91. * +#endif
  92. * +
  93. *
  94. * Revision 1.4 2005/04/19 22:07:37 nkinder
  95. * Fixed licensing typo
  96. *
  97. * Revision 1.3 2005/04/15 22:40:35 nkinder
  98. * 155068 - Added license to source files
  99. *
  100. * Revision 1.2 2005/02/28 23:38:00 nkinder
  101. * 149951 - Updated source code copyrights
  102. *
  103. * Revision 1.1.1.1 2005/01/21 00:40:51 cvsadm
  104. * Moving NSCP Directory Server from DirectoryBranch to TRUNK, initial drop. (foxworth)
  105. *
  106. * Revision 1.3.20.1.2.11 2005/01/14 01:22:12 nhosoi
  107. * For the open-source project.
  108. * 1) eliminated 'netsite' level
  109. * 2) moved ns/config one level lower
  110. * 3) moved fasttime to lib/base
  111. *
  112. * Revision 1.3.20.1.2.10 2004/10/01 18:46:09 dboreham
  113. * Rename the built in regex functions to avoid colliding with the native OS versions, where presnet
  114. *
  115. * Revision 1.2 2004/10/01 06:29:11 david
  116. * rename regex functions to avoid collision with native OS functions on Solaris
  117. *
  118. * Revision 1.1.1.1 2004/06/03 22:32:48 telackey
  119. * Initial import Thu Jun 3 15:32:43 PDT 2004
  120. *
  121. * Revision 1.3.20.1.2.9 2003/09/22 19:42:12 ulfw
  122. * Update copyright years from 2001 to 2001-2003
  123. *
  124. * Revision 1.3.20.1.2.8 2001/11/03 00:13:55 richm
  125. * XXX use new copyright XXX
  126. *
  127. * Revision 1.3.20.1.2.7 2001/10/07 00:59:03 richm
  128. * ldapserver/ldap/servers/slapd/regex.c
  129. * 1.3.20.1.2.7
  130. * 20010918
  131. *
  132. * Remove copyright caracter form copyright
  133. *
  134. *
  135. * ====================================================
  136. *
  137. * Revision 1.3.20.1.2.7 2001/09/18 11:43:06 rmarco
  138. * Remove copyright caracter form copyright
  139. *
  140. * Revision 1.3.20.1.2.6 2001/02/13 09:45:16 rmarco
  141. * copyrights
  142. *
  143. * Revision 1.3.20.1.2.5 1999/12/06 10:49:33 robey
  144. * fix gcc warning
  145. *
  146. * Revision 1.3.20.1.2.4 1999/11/18 01:26:26 robey
  147. * fix gcc warnings
  148. *
  149. * Revision 1.3.20.1.2.3 1999/08/20 23:13:33 merrells
  150. * tidy up extern things
  151. *
  152. * Revision 1.3.20.1.2.2 1999/05/19 23:41:58 merrells
  153. * Keep the Solaris compiler quiet
  154. *
  155. * Revision 1.3.20.1.2.1 1999/05/14 18:39:50 merrells
  156. * value manipulation code extraction and reworking.
  157. *
  158. * Revision 1.3.20.1 1998/10/10 02:28:25 ggood
  159. * Copy from Directory40RtmBranchpoint to DirectoryBranch
  160. *
  161. * Revision 1.3.10.5 1997/12/11 23:12:01 kristian
  162. * fix bugs 97502, 97504 & 96569: handle 8-bit char's (especially UTF-8) correctly.
  163. *
  164. * Revision 1.3.10.4 1997/12/07 22:13:08 howes
  165. * Always compile re_init(), re_lock(), and re_unlock(), even
  166. * on platforms where we use the native regex stuff.
  167. *
  168. * Revision 1.3.10.3 1997/12/07 00:01:16 howes
  169. * Add re_init(), re_lock(), and re_unlock() routines, to avoid race
  170. * between acl and syntax code.
  171. *
  172. * Revision 1.3.10.2 1997/07/17 07:49:13 mcs
  173. * merge changes made on ldapsdk_10_branch into server3_directory_branch
  174. *
  175. * Revision 1.3.10.1 1997/03/27 06:39:33 ggood
  176. * Fix up more compiler warnings.
  177. *
  178. * Revision 1.3 1996/11/07 00:44:44 mcs
  179. * eliminate a few compiler warnings
  180. *
  181. * Revision 1.2 1996/11/05 21:38:05 kristian
  182. * copied from Directory_1996_11_04
  183. *
  184. * Revision 1.1.1.1.2.1 1996/05/07 19:54:53 kristian
  185. * Merged UMich ldap-3_3 into Dogbert
  186. *
  187. * Revision 1.1.1.2 1996/05/04 19:11:02 kristian
  188. * UMich version 3.3
  189. *
  190. * Revision 1.2 1996/04/25 16:24:11 mcs
  191. * make re_exec() match "" with ".*" and similar patterns
  192. * hopefully this change doesn't break anything else!
  193. *
  194. * Revision 1.1 1995/02/03 15:56:52 tim
  195. * Initial revision
  196. *
  197. * Revision 1.11 1994/12/14 21:33:45 mcs
  198. * use new NEED_BSDREGEX
  199. * fix pmatch() prototype
  200. *
  201. * Revision 1.10 1994/12/12 18:16:39 mcs
  202. * use on NetBSD
  203. *
  204. * Revision 1.9 1994/11/15 19:16:35 mcs
  205. * add (CHAR) cast to make VisualC++ happy
  206. *
  207. * Revision 1.8 1994/11/08 21:14:32 mcs
  208. * WIN32 changes
  209. *
  210. * Revision 1.7 1994/07/23 19:51:24 mcs
  211. * use ANSI-style inline function parameters
  212. *
  213. * Revision 1.6 1993/10/18 01:52:32 tim
  214. * include for VMS
  215. *
  216. * Revision 1.5 1993/09/28 21:37:54 mcs
  217. * HP/UX needs the regex we include (not in its libc)
  218. *
  219. * Revision 1.4 1993/08/27 15:59:52 mcs
  220. * use CHAR for deftab
  221. *
  222. * Revision 1.3 1993/08/27 15:49:47 mcs
  223. * added missing 0 to octal constants
  224. * use unsigned char for CHAR under DOS
  225. *
  226. * Revision 1.2 1993/08/27 14:57:48 mcs
  227. * add proto. for pmatch
  228. *
  229. * Revision 1.1 1993/08/18 21:20:02 mcs
  230. * Initial revision
  231. *
  232. * Revision 1.4 1991/10/17 03:56:42 oz
  233. * miscellaneous changes, small cleanups etc.
  234. *
  235. * Revision 1.3 1989/04/01 14:18:09 oz
  236. * Change all references to a dfa: this is actually an nfa.
  237. *
  238. * Revision 1.2 88/08/28 15:36:04 oz
  239. * Use a complement bitmap to represent NCL.
  240. * This removes the need to have separate
  241. * code in the pmatch case block - it is
  242. * just CCL code now.
  243. *
  244. * Use the actual CCL code in the CLO
  245. * section of pmatch. No need for a recursive
  246. * pmatch call.
  247. *
  248. * Use a bitmap table to set char bits in an
  249. * 8-bit chunk.
  250. *
  251. * Interfaces:
  252. * The following three interfaces were added to avoid a race
  253. * condition in slapd. The better long-term solution is to make
  254. * the regex code thread-safe, by passing in the buffers needed.
  255. *
  256. * re_init: initializes the regex system. must be called
  257. * before any other regex calls are made.
  258. *
  259. * re_lock: locks the regex system. must be called to avoid
  260. * races between calls to re_comp and re_exec.
  261. *
  262. * re_unlock unlocks the regex system. must be called after
  263. * a set of calls to re_comp and re_exec.
  264. *
  265. * re_comp: compile a regular expression into a NFA.
  266. *
  267. * char *re_comp(s)
  268. * char *s;
  269. *
  270. * re_exec: execute the NFA to match a pattern.
  271. *
  272. * int re_exec(s)
  273. * char *s;
  274. *
  275. * re_modw change re_exec's understanding of what a "word"
  276. * looks like (for \< and \>) by adding into the
  277. * hidden word-syntax table.
  278. *
  279. * void re_modw(s)
  280. * char *s;
  281. *
  282. * re_subs: substitute the matched portions in a new string.
  283. *
  284. * int re_subs(src, dst)
  285. * char *src;
  286. * char *dst;
  287. *
  288. * re_fail: failure routine for re_exec.
  289. *
  290. * void re_fail(msg, op)
  291. * char *msg;
  292. * char op;
  293. *
  294. * Regular Expressions:
  295. *
  296. * [1] char matches itself, unless it is a special
  297. * character (metachar): . \ [ ] * + ^ $
  298. *
  299. * [2] . matches any character.
  300. *
  301. * [3] \ matches the character following it, except
  302. * when followed by a left or right round bracket,
  303. * a digit 1 to 9 or a left or right angle bracket.
  304. * (see [7], [8] and [9])
  305. * It is used as an escape character for all
  306. * other meta-characters, and itself. When used
  307. * in a set ([4]), it is treated as an ordinary
  308. * character.
  309. *
  310. * [4] [set] matches one of the characters in the set.
  311. * If the first character in the set is "^",
  312. * it matches a character NOT in the set, i.e.
  313. * complements the set. A shorthand S-E is
  314. * used to specify a set of characters S upto
  315. * E, inclusive. The special characters "]" and
  316. * "-" have no special meaning if they appear
  317. * as the first chars in the set.
  318. * examples: match:
  319. *
  320. * [a-z] any lowercase alpha
  321. *
  322. * [^]-] any char except ] and -
  323. *
  324. * [^A-Z] any char except uppercase
  325. * alpha
  326. *
  327. * [a-zA-Z] any alpha
  328. *
  329. * [5] * any regular expression form [1] to [4], followed by
  330. * closure char (*) matches zero or more matches of
  331. * that form.
  332. *
  333. * [6] + same as [5], except it matches one or more.
  334. *
  335. * [7] a regular expression in the form [1] to [10], enclosed
  336. * as \(form\) matches what form matches. The enclosure
  337. * creates a set of tags, used for [8] and for
  338. * pattern substitution. The tagged forms are numbered
  339. * starting from 1.
  340. *
  341. * [8] a \ followed by a digit 1 to 9 matches whatever a
  342. * previously tagged regular expression ([7]) matched.
  343. *
  344. * [9] \< a regular expression starting with a \< construct
  345. * \> and/or ending with a \> construct, restricts the
  346. * pattern matching to the beginning of a word, and/or
  347. * the end of a word. A word is defined to be a character
  348. * string beginning and/or ending with the characters
  349. * A-Z a-z 0-9 and _. It must also be preceded and/or
  350. * followed by any character outside those mentioned.
  351. *
  352. * [10] a composite regular expression xy where x and y
  353. * are in the form [1] to [10] matches the longest
  354. * match of x followed by a match for y.
  355. *
  356. * [11] ^ a regular expression starting with a ^ character
  357. * $ and/or ending with a $ character, restricts the
  358. * pattern matching to the beginning of the line,
  359. * or the end of line. [anchors] Elsewhere in the
  360. * pattern, ^ and $ are treated as ordinary characters.
  361. *
  362. *
  363. * Acknowledgements:
  364. *
  365. * HCR's Hugh Redelmeier has been most helpful in various
  366. * stages of development. He convinced me to include BOW
  367. * and EOW constructs, originally invented by Rob Pike at
  368. * the University of Toronto.
  369. *
  370. * References:
  371. * Software tools Kernighan & Plauger
  372. * Software tools in Pascal Kernighan & Plauger
  373. * Grep [rsx-11 C dist] David Conroy
  374. * ed - text editor Un*x Programmer's Manual
  375. * Advanced editing on Un*x B. W. Kernighan
  376. * RegExp routines Henry Spencer
  377. *
  378. * Notes:
  379. *
  380. * This implementation uses a bit-set representation for character
  381. * classes for speed and compactness. Each character is represented
  382. * by one bit in a 128-bit block. Thus, CCL always takes a
  383. * constant 16 bytes in the internal nfa, and re_exec does a single
  384. * bit comparison to locate the character in the set.
  385. *
  386. * Examples:
  387. *
  388. * pattern: foo*.*
  389. * compile: CHR f CHR o CLO CHR o END CLO ANY END END
  390. * matches: fo foo fooo foobar fobar foxx ...
  391. *
  392. * pattern: fo[ob]a[rz]
  393. * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
  394. * matches: fobar fooar fobaz fooaz
  395. *
  396. * pattern: foo\\+
  397. * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
  398. * matches: foo\ foo\\ foo\\\ ...
  399. *
  400. * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
  401. * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
  402. * matches: foo1foo foo2foo foo3foo
  403. *
  404. * pattern: \(fo.*\)-\1
  405. * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
  406. * matches: foo-foo fo-fo fob-fob foobar-foobar ...
  407. */
  /*
   * Upper bound on how far the NFA write cursor can advance while one
   * pattern character is processed.  The worst case is '+' after a
   * character class, e.g. "[a-z]+": the class (1 CCL byte + 16 bitmap
   * bytes) is duplicated and END/CLO bookkeeping bytes are added.
   * slapd_re_comp grows the buffer whenever fewer than this many bytes
   * remain.
   */
  #define MAXOPSPACE 20
  #define MAXNFA 1024     /* initial value of nfasize, in bytes */
  #define MAXTAG 10       /* tag slots: 0 = whole match, 1..9 = \( \) groups */

  /* Values of sta: was the last slapd_re_comp() call successful? */
  #define OKP 1
  #define NOP 0

  /* NFA opcodes. */
  #define CHR 1           /* literal character; operand: the character */
  #define ANY 2           /* any single character */
  #define CCL 3           /* character class; operand: BITBLK-byte bitmap */
  #define BOL 4           /* beginning of line */
  #define EOL 5           /* end of line */
  #define BOT 6           /* beginning of tagged group; operand: tag number */
  #define EOT 7           /* end of tagged group; operand: tag number */
  #define BOW 8           /* beginning of word */
  #define EOW 9           /* end of word */
  #define REF 10          /* back reference; operand: tag number */
  #define CLO 11          /* closure over the opcode that follows */
  #define END 0           /* end of closure body / end of automaton */

  /*
   * The following defines are not meant to be changeable.
   * They are for readability only.
   */
  #define MAXCHR 128
  #define CHRBIT 8
  /* Parenthesized so that the expansion is safe inside larger expressions. */
  #define BITBLK (MAXCHR/CHRBIT)  /* bytes in a class bitmap (16) */
  #define BLKIND 0170             /* bits 3-6 of a char: bitmap byte index */
  #define BITIND 07               /* bits 0-2 of a char: bit within the byte */
  #define ASCIIB 0177             /* low seven bits */

  typedef unsigned char UCHAR;
  /* char, on the other hand, may be signed or unsigned;
   * it's platform-dependent.  A hard fact of life, in C.
   */

  static int tagstk[MAXTAG];      /* numbers of the \( groups still open */
  /*
   * Compiled automaton.  slapd_re_comp enlarges it with slapi_ch_realloc;
   * NOTE(review): the initial allocation is not in this part of the file -
   * presumably the init routine provides a buffer of MAXNFA bytes.
   */
  static UCHAR *nfa = NULL;
  static int nfasize = MAXNFA;    /* current size of the nfa buffer */
  static int sta = NOP;           /* status of the last compiled pattern */
  /*
   * Scratch bitmap for the class being compiled; slapd_re_comp zeroes it
   * again as it copies the finished class into the nfa.
   */
  static UCHAR bittab[BITBLK];
  /* bitarr[i] has only bit i set. */
  static UCHAR bitarr[] = {1,2,4,8,16,32,64,128};

  #ifdef DEBUG
  static void nfadump( UCHAR *ap);
  #endif
  453. static void
  454. chset(UCHAR c)
  455. {
  456. bittab[((c) & (unsigned)BLKIND) >> 3] |= bitarr[(c) & BITIND];
  457. }
  458. #define badpat(x) (*nfa = END, x)
  459. #define store(x) *mp++ = x
  460. char *
  461. slapd_re_comp( char *pat )
  462. {
  463. register UCHAR *p; /* pattern pointer */
  464. register UCHAR *mp=nfa; /* nfa pointer */
  465. register UCHAR *lp; /* saved pointer.. */
  466. register UCHAR *sp=nfa; /* another one.. */
  467. register int tagi = 0; /* tag stack index */
  468. register int tagc = 1; /* actual tag count */
  469. register int n;
  470. register UCHAR mask; /* xor mask -CCL/NCL */
  471. int c1, c2;
  472. if (!pat || !*pat) {
  473. if (sta)
  474. return 0;
  475. else
  476. return badpat("No previous regular expression");
  477. }
  478. sta = NOP;
  479. for (p = (UCHAR*)pat; *p; p++) {
  480. /* Check if we are approaching end of nfa buffer. MAXOPSPACE is
  481. the max we might add to the nfa per loop. */
  482. if (mp - (UCHAR*)nfa + MAXOPSPACE >= nfasize) {
  483. /* Save offsets */
  484. ptrdiff_t mppos = mp - nfa;
  485. ptrdiff_t sppos = sp - nfa;
  486. /* Double the nfa buffer size */
  487. nfasize *= 2;
  488. nfa = (UCHAR*)slapi_ch_realloc((char*)nfa, nfasize);
  489. /* Restore pointers into realloced space */
  490. mp = nfa + mppos;
  491. sp = nfa + sppos;
  492. }
  493. lp = mp;
  494. switch(*p) {
  495. case '.': /* match any char.. */
  496. store(ANY);
  497. break;
  498. case '^': /* match beginning.. */
  499. if (p == (UCHAR*)pat)
  500. store(BOL);
  501. else {
  502. store(CHR);
  503. store(*p);
  504. }
  505. break;
  506. case '$': /* match endofline.. */
  507. if (!*(p+1))
  508. store(EOL);
  509. else {
  510. store(CHR);
  511. store(*p);
  512. }
  513. break;
  514. case '[': /* match char class..*/
  515. store(CCL);
  516. if (*++p == '^') {
  517. mask = 0377;
  518. p++;
  519. }
  520. else
  521. mask = 0;
  522. if (*p == '-') /* real dash */
  523. chset(*p++);
  524. if (*p == ']') /* real brac */
  525. chset(*p++);
  526. while (*p && *p != ']') {
  527. if (*p == '-' && *(p+1) && *(p+1) != ']') {
  528. p++;
  529. c1 = *(p-2) + 1;
  530. c2 = *p++;
  531. while (c1 <= c2)
  532. chset((UCHAR)c1++);
  533. }
  534. #ifdef EXTEND
  535. else if (*p == '\\' && *(p+1)) {
  536. p++;
  537. chset(*p++);
  538. }
  539. #endif
  540. else
  541. chset(*p++);
  542. }
  543. if (!*p)
  544. return badpat("Missing ]");
  545. for (n = 0; n < BITBLK; bittab[n++] = (UCHAR) 0)
  546. store(mask ^ bittab[n]);
  547. break;
  548. case '*': /* match 0 or more.. */
  549. case '+': /* match 1 or more.. */
  550. if (p == (UCHAR*)pat)
  551. return badpat("Empty closure");
  552. lp = sp; /* previous opcode */
  553. if (*lp == CLO) /* equivalence.. */
  554. break;
  555. switch(*lp) {
  556. case BOL:
  557. case BOT:
  558. case EOT:
  559. case BOW:
  560. case EOW:
  561. case REF:
  562. return badpat("Illegal closure");
  563. default:
  564. break;
  565. }
  566. if (*p == '+')
  567. for (sp = mp; lp < sp; lp++)
  568. store(*lp);
  569. store(END);
  570. store(END);
  571. sp = mp;
  572. while (--mp > lp)
  573. *mp = mp[-1];
  574. store(CLO);
  575. mp = sp;
  576. break;
  577. case '\\': /* tags, backrefs .. */
  578. switch(*++p) {
  579. case '(':
  580. if (tagc < MAXTAG) {
  581. tagstk[++tagi] = tagc;
  582. store(BOT);
  583. store(tagc++);
  584. }
  585. else
  586. return badpat("Too many \\(\\) pairs");
  587. break;
  588. case ')':
  589. if (*sp == BOT)
  590. return badpat("Null pattern inside \\(\\)");
  591. if (tagi > 0) {
  592. store(EOT);
  593. store(tagstk[tagi--]);
  594. }
  595. else
  596. return badpat("Unmatched \\)");
  597. break;
  598. case '<':
  599. store(BOW);
  600. break;
  601. case '>':
  602. if (*sp == BOW)
  603. return badpat("Null pattern inside \\<\\>");
  604. store(EOW);
  605. break;
  606. case '1':
  607. case '2':
  608. case '3':
  609. case '4':
  610. case '5':
  611. case '6':
  612. case '7':
  613. case '8':
  614. case '9':
  615. n = *p-'0';
  616. if (tagi > 0 && tagstk[tagi] == n)
  617. return badpat("Cyclical reference");
  618. if (tagc > n) {
  619. store(REF);
  620. store(n);
  621. }
  622. else
  623. return badpat("Undetermined reference");
  624. break;
  625. #ifdef EXTEND
  626. case 'b':
  627. store(CHR);
  628. store('\b');
  629. break;
  630. case 'n':
  631. store(CHR);
  632. store('\n');
  633. break;
  634. case 'f':
  635. store(CHR);
  636. store('\f');
  637. break;
  638. case 'r':
  639. store(CHR);
  640. store('\r');
  641. break;
  642. case 't':
  643. store(CHR);
  644. store('\t');
  645. break;
  646. #endif
  647. default:
  648. store(CHR);
  649. store(*p);
  650. }
  651. break;
  652. default : /* an ordinary char */
  653. store(CHR);
  654. store(*p);
  655. break;
  656. }
  657. sp = lp;
  658. }
  659. if (tagi > 0)
  660. return badpat("Unmatched \\(");
  661. store(END);
  662. sta = OKP;
  663. return 0;
  664. }
  665. static UCHAR *bol;
  666. static UCHAR *bopat[MAXTAG];
  667. static UCHAR *eopat[MAXTAG];
  668. #ifdef NEEDPROTOS
  669. static UCHAR *pmatch( UCHAR *lp, UCHAR *ap, time_t time_up, int *err );
  670. #else /* NEEDPROTOS */
  671. static UCHAR *pmatch();
  672. #endif /* NEEDPROTOS */
  673. /*
  674. * re_exec:
  675. * execute nfa to find a match.
  676. *
  677. * special cases: (nfa[0])
  678. * BOL
  679. * Match only once, starting from the
  680. * beginning.
  681. * CHR
  682. * First locate the character without
  683. * calling pmatch, and if found, call
  684. * pmatch for the remaining string.
  685. * END
  686. * re_comp failed, poor luser did not
  687. * check for it. Fail fast.
  688. *
  689. * If a match is found, bopat[0] and eopat[0] are set
  690. * to the beginning and the end of the matched fragment,
  691. * respectively.
  692. *
  693. * return values: 0 -- did not match
  694. * 1 -- matched
  695. * otherwise -- ldap error (TIMELIMIT_EXCEEDED only)
  696. */
  697. int
  698. slapd_re_exec( char *lp, time_t time_up )
  699. {
  700. register UCHAR c;
  701. register UCHAR *ep = 0;
  702. register UCHAR *ap = nfa;
  703. int ldaperror = 0;
  704. bol = (UCHAR*)lp;
  705. bopat[0] = 0;
  706. bopat[1] = 0;
  707. bopat[2] = 0;
  708. bopat[3] = 0;
  709. bopat[4] = 0;
  710. bopat[5] = 0;
  711. bopat[6] = 0;
  712. bopat[7] = 0;
  713. bopat[8] = 0;
  714. bopat[9] = 0;
  715. switch(*ap) {
  716. case BOL: /* anchored: match from BOL only */
  717. ep = pmatch((UCHAR*)lp,ap,time_up,&ldaperror);
  718. break;
  719. case CHR: /* ordinary char: locate it fast */
  720. c = *(ap+1);
  721. while (*lp && *(UCHAR*)lp != c)
  722. lp++;
  723. if (!*lp) /* if EOS, fail, else fall thru. */
  724. return 0;
  725. default: /* regular matching all the way. */
  726. do {
  727. if ((ep = pmatch((UCHAR*)lp,ap,time_up,&ldaperror)))
  728. break;
  729. if (*lp)
  730. lp++;
  731. } while (*lp);
  732. break;
  733. case END: /* munged automaton. fail always */
  734. return 0;
  735. }
  736. if (ldaperror)
  737. return ldaperror;
  738. if (!ep)
  739. return 0;
  740. bopat[0] = (UCHAR*)lp;
  741. eopat[0] = ep;
  742. return 1;
  743. }
  744. /*
  745. * pmatch: internal routine for the hard part
  746. *
  747. * This code is partly snarfed from an early grep written by
  748. * David Conroy. The backref and tag stuff, and various other
  749. * innovations are by oz.
  750. *
  751. * special case optimizations: (nfa[n], nfa[n+1])
  752. * CLO ANY
  753. * We KNOW .* will match everything upto the
  754. * end of line. Thus, directly go to the end of
  755. * line, without recursive pmatch calls. As in
  756. * the other closure cases, the remaining pattern
  757. * must be matched by moving backwards on the
  758. * string recursively, to find a match for xy
  759. * (x is ".*" and y is the remaining pattern)
  760. * where the match satisfies the LONGEST match for
  761. * x followed by a match for y.
  762. * CLO CHR
  763. * We can again scan the string forward for the
  764. * single char and at the point of failure, we
  765. * execute the remaining nfa recursively, same as
  766. * above.
  767. *
  768. * At the end of a successful match, bopat[n] and eopat[n]
  769. * are set to the beginning and end of subpatterns matched
  770. * by tagged expressions (n = 1 to 9).
  771. *
  772. */
  773. #ifndef re_fail
  774. void re_fail();
  775. #endif /* re_fail */
  776. /*
  777. * character classification table for word boundary operators BOW
  778. * and EOW. the reason for not using ctype macros is that we can
  779. * let the user add into our own table. see re_modw. This table
  780. * is not in the bitset form, since we may wish to extend it in the
  781. * future for other character classifications.
  782. *
  783. * TRUE for 0-9 A-Z a-z _
  784. */
  785. static char chrtyp[MAXCHR] = {
  786. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  787. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  788. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  789. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  790. 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
  791. 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
  792. 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
  793. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  794. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  795. 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
  796. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  797. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  798. 1, 1, 1, 0, 0, 0, 0, 0
  799. };
  800. #define inascii(x) (0177&(x))
  801. #define iswordc(x) chrtyp[inascii(x)]
  802. #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
  803. /*
  804. * skip values for CLO XXX to skip past the closure
  805. */
  806. #define ANYSKIP 2 /* [CLO] ANY END ... */
  807. #define CHRSKIP 3 /* [CLO] CHR chr END ... */
  808. #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
  809. static UCHAR *
  810. pmatch( UCHAR *lp, UCHAR *ap, time_t time_up, int *err )
  811. {
  812. register int op, c, n;
  813. register UCHAR *e; /* extra pointer for CLO */
  814. register UCHAR *bp; /* beginning of subpat.. */
  815. register UCHAR *ep; /* ending of subpat.. */
  816. UCHAR *are; /* to save the line ptr. */
  817. time_t curtime = current_time();
  818. if ( time_up != -1 && curtime > time_up ) {
  819. *err = LDAP_TIMELIMIT_EXCEEDED;
  820. return 0;
  821. }
  822. while ((op = *ap++) != END)
  823. switch(op) {
  824. case CHR:
  825. if (*lp++ != *ap++)
  826. return 0;
  827. break;
  828. case ANY:
  829. if (!*lp++)
  830. return 0;
  831. break;
  832. case CCL:
  833. c = *lp++;
  834. if (!isinset(ap,c))
  835. return 0;
  836. ap += BITBLK;
  837. break;
  838. case BOL:
  839. if (lp != bol)
  840. return 0;
  841. break;
  842. case EOL:
  843. if (*lp)
  844. return 0;
  845. break;
  846. case BOT:
  847. bopat[*ap++] = lp;
  848. break;
  849. case EOT:
  850. eopat[*ap++] = lp;
  851. break;
  852. case BOW:
  853. if ((lp!=bol && iswordc(lp[-1])) || !iswordc(*lp))
  854. return 0;
  855. break;
  856. case EOW:
  857. if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
  858. return 0;
  859. break;
  860. case REF:
  861. n = *ap++;
  862. bp = bopat[n];
  863. ep = eopat[n];
  864. while (bp < ep)
  865. if (*bp++ != *lp++)
  866. return 0;
  867. break;
  868. case CLO:
  869. are = lp;
  870. switch(*ap) {
  871. case ANY:
  872. while (*lp)
  873. lp++;
  874. n = ANYSKIP;
  875. break;
  876. case CHR:
  877. c = *(ap+1);
  878. while (*lp && c == *lp)
  879. lp++;
  880. n = CHRSKIP;
  881. break;
  882. case CCL:
  883. while ((c = *lp) && isinset(ap+1,c))
  884. lp++;
  885. n = CCLSKIP;
  886. break;
  887. default:
  888. re_fail("closure: bad nfa.", *ap);
  889. return 0;
  890. }
  891. ap += n;
  892. while (lp >= are) {
  893. if ((e = pmatch(lp, ap, time_up, err)) != NULL)
  894. return e;
  895. --lp;
  896. }
  897. return 0;
  898. default:
  899. re_fail("re_exec: bad nfa.", op);
  900. return 0;
  901. }
  902. return lp;
  903. }
  904. /*
  905. * re_modw:
  906. * add new characters into the word table to change re_exec's
  907. * understanding of what a word should look like. Note that we
  908. * only accept additions into the word definition.
  909. *
  910. * If the string parameter is 0 or null string, the table is
  911. * reset back to the default containing A-Z a-z 0-9 _. [We use
  912. * the compact bitset representation for the default table]
  913. */
  914. static UCHAR deftab[16] = {
  915. 0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,
  916. 0376, 0377, 0377, 007
  917. };
  918. void
  919. slapd_re_modw( char *s )
  920. {
  921. register int i;
  922. if (!s || !*s) {
  923. for (i = 0; i < MAXCHR; i++)
  924. if (!isinset(deftab,i))
  925. iswordc(i) = 0;
  926. }
  927. else
  928. while(*s)
  929. iswordc(*s++) = 1;
  930. }
  931. /*
  932. * re_subs:
  933. * substitute the matched portions of the src in dst.
  934. *
  935. * & substitute the entire matched pattern.
  936. *
  937. * \digit substitute a subpattern, with the given tag number.
  938. * Tags are numbered from 1 to 9. If the particular
  939. * tagged subpattern does not exist, null is substituted.
  940. */
  941. int
  942. slapd_re_subs( char *src, char *dst)
  943. {
  944. register char c;
  945. register int pin;
  946. register UCHAR *bp;
  947. register UCHAR *ep;
  948. if (!*src || !bopat[0])
  949. return 0;
  950. while ((c = *src++) != 0) {
  951. switch(c) {
  952. case '&':
  953. pin = 0;
  954. break;
  955. case '\\':
  956. c = *src++;
  957. if (c >= '0' && c <= '9') {
  958. pin = c - '0';
  959. break;
  960. }
  961. default:
  962. *dst++ = c;
  963. continue;
  964. }
  965. if ((bp = bopat[pin]) && (ep = eopat[pin])) {
  966. while (*bp && bp < ep)
  967. *dst++ = *(char*)bp++;
  968. if (bp < ep)
  969. return 0;
  970. }
  971. }
  972. *dst = (char) 0;
  973. return 1;
  974. }
  975. #ifdef DEBUG
  976. /*
  977. * symbolic - produce a symbolic dump of the nfa
  978. */
  979. void
  980. symbolic( char *s )
  981. {
  982. printf("pattern: %s\n", s);
  983. printf("nfacode:\n");
  984. nfadump(nfa);
  985. }
  986. static void
  987. nfadump( UCHAR *ap)
  988. {
  989. register int n;
  990. while (*ap != END)
  991. switch(*ap++) {
  992. case CLO:
  993. printf("CLOSURE");
  994. nfadump(ap);
  995. switch(*ap) {
  996. case CHR:
  997. n = CHRSKIP;
  998. break;
  999. case ANY:
  1000. n = ANYSKIP;
  1001. break;
  1002. case CCL:
  1003. n = CCLSKIP;
  1004. break;
  1005. }
  1006. ap += n;
  1007. break;
  1008. case CHR:
  1009. printf("\tCHR %c\n",*ap++);
  1010. break;
  1011. case ANY:
  1012. printf("\tANY .\n");
  1013. break;
  1014. case BOL:
  1015. printf("\tBOL -\n");
  1016. break;
  1017. case EOL:
  1018. printf("\tEOL -\n");
  1019. break;
  1020. case BOT:
  1021. printf("BOT: %d\n",*ap++);
  1022. break;
  1023. case EOT:
  1024. printf("EOT: %d\n",*ap++);
  1025. break;
  1026. case BOW:
  1027. printf("BOW\n");
  1028. break;
  1029. case EOW:
  1030. printf("EOW\n");
  1031. break;
  1032. case REF:
  1033. printf("REF: %d\n",*ap++);
  1034. break;
  1035. case CCL:
  1036. printf("\tCCL [");
  1037. for (n = 0; n < MAXCHR; n++)
  1038. if (isinset(ap,(UCHAR)n)) {
  1039. if (n < ' ')
  1040. printf("^%c", n ^ 0x040);
  1041. else
  1042. printf("%c", n);
  1043. }
  1044. printf("]\n");
  1045. ap += BITBLK;
  1046. break;
  1047. default:
  1048. printf("bad nfa. opcode %o\n", ap[-1]);
  1049. exit(1);
  1050. break;
  1051. }
  1052. }
  1053. #endif
  1054. #endif /* MACOS or DOS or NEED_BSDREGEX */
  1055. static PRLock *regex_mutex = NULL;
  1056. int
  1057. slapd_re_init( void )
  1058. {
  1059. if ( NULL == regex_mutex ) {
  1060. regex_mutex = PR_NewLock();
  1061. }
  1062. if ( NULL == nfa ) {
  1063. nfa = (UCHAR*)slapi_ch_malloc( MAXNFA );
  1064. }
  1065. return( NULL == regex_mutex ? -1 : 0 );
  1066. }
  1067. void
  1068. slapd_re_lock( void )
  1069. {
  1070. PR_ASSERT( NULL != regex_mutex );
  1071. PR_Lock( regex_mutex );
  1072. }
  1073. int
  1074. slapd_re_unlock( void )
  1075. {
  1076. PR_ASSERT( NULL != regex_mutex );
  1077. return( PR_Unlock( regex_mutex ) );
  1078. }