uregex.h 72 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614
  1. /*
  2. **********************************************************************
  3. * Copyright (C) 2004-2015, International Business Machines
  4. * Corporation and others. All Rights Reserved.
  5. **********************************************************************
  6. * file name: uregex.h
  7. * encoding: US-ASCII
  8. * indentation:4
  9. *
  10. * created on: 2004mar09
  11. * created by: Andy Heninger
  12. *
  13. * ICU Regular Expressions, API for C
  14. */
  15. /**
  16. * \file
  17. * \brief C API: Regular Expressions
  18. *
  19. * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p>
  20. */
  21. #ifndef UREGEX_H
  22. #define UREGEX_H
  23. #include "unicode/utext.h"
  24. #include "unicode/utypes.h"
  25. #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  26. #include "unicode/localpointer.h"
  27. #include "unicode/parseerr.h"
  28. struct URegularExpression;
  29. /**
  30. * Structure representing a compiled regular expression, plus the results
  31. * of a match operation.
  32. * @stable ICU 3.0
  33. */
  34. typedef struct URegularExpression URegularExpression;
  35. /**
  36. * Constants for Regular Expression Match Modes. Each constant is a distinct bit; combine flags with bitwise OR.
  37. * @stable ICU 2.4
  38. */
  39. typedef enum URegexpFlag{
  40. #ifndef U_HIDE_DRAFT_API
  41. /** Forces normalization of pattern and strings.
  42. Not implemented yet, just a placeholder, hence draft.
  43. @draft ICU 2.4 */
  44. UREGEX_CANON_EQ = 128,
  45. #endif /* U_HIDE_DRAFT_API */
  46. /** Enable case-insensitive matching. @stable ICU 2.4 */
  47. UREGEX_CASE_INSENSITIVE = 2,
  48. /** Allow white space and comments within patterns. @stable ICU 2.4 */
  49. UREGEX_COMMENTS = 4,
  50. /** If set, '.' matches line terminators; otherwise '.' matching stops at line end.
  51. * @stable ICU 2.4 */
  52. UREGEX_DOTALL = 32,
  53. /** If set, treat the entire pattern as a literal string.
  54. * Metacharacters or escape sequences in the input sequence will be given
  55. * no special meaning.
  56. *
  57. * The flag UREGEX_CASE_INSENSITIVE retains its impact
  58. * on matching when used in conjunction with this flag.
  59. * The other flags become superfluous.
  60. *
  61. * @stable ICU 4.0
  62. */
  63. UREGEX_LITERAL = 16,
  64. /** Control behavior of "$" and "^".
  65. * If set, recognize line terminators within the string;
  66. * otherwise, match only at start and end of input string.
  67. * @stable ICU 2.4 */
  68. UREGEX_MULTILINE = 8,
  69. /** Unix-only line endings.
  70. * When this mode is enabled, only \\u000a is recognized as a line ending
  71. * in the behavior of ., ^, and $.
  72. * @stable ICU 4.0
  73. */
  74. UREGEX_UNIX_LINES = 1,
  75. /** Unicode word boundaries.
  76. * If set, \b uses the Unicode TR 29 definition of word boundaries.
  77. * Warning: Unicode word boundaries are quite different from
  78. * traditional regular expression word boundaries. See
  79. * http://unicode.org/reports/tr29/#Word_Boundaries
  80. * @stable ICU 2.8
  81. */
  82. UREGEX_UWORD = 256,
  83. /** Error on unrecognized backslash escapes.
  84. * If set, fail with an error on patterns that contain
  85. * backslash-escaped ASCII letters without a known special
  86. * meaning. If this flag is not set, these
  87. * escaped letters represent themselves.
  88. * @stable ICU 4.0
  89. */
  90. UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512
  91. } URegexpFlag;
  92. /**
  93. * Open (compile) an ICU regular expression. Compiles the regular expression in
  94. * string form into an internal representation using the specified match mode flags.
  95. * The resulting regular expression handle can then be used to perform various
  96. * matching operations.
  97. *
  98. *
  99. * @param pattern The Regular Expression pattern to be compiled.
  100. * @param patternLength The length of the pattern, or -1 if the pattern is
  101. * NUL terminated.
  102. * @param flags Flags that alter the default matching behavior for
  103. * the regular expression, UREGEX_CASE_INSENSITIVE, for
  104. * example. For default behavior, set this parameter to zero.
  105. * See <code>enum URegexpFlag</code>. All desired flags
  106. * are bitwise-ORed together.
  107. * @param pe Receives the position (line and column numbers) of any syntax
  108. * error within the source regular expression string. If this
  109. * information is not wanted, pass NULL for this parameter.
  110. * @param status Receives errors detected by this function.
  111. * @return The URegularExpression object representing the compiled pattern.
  112. * @stable ICU 3.0
  113. */
  114. U_STABLE URegularExpression * U_EXPORT2
  115. uregex_open( const UChar *pattern,
  116. int32_t patternLength,
  117. uint32_t flags,
  118. UParseError *pe,
  119. UErrorCode *status);
  120. /**
  121. * Open (compile) an ICU regular expression. Compiles the regular expression in
  122. * string form into an internal representation using the specified match mode flags.
  123. * The resulting regular expression handle can then be used to perform various
  124. * matching operations.
  125. * <p>
  126. * The contents of the pattern UText will be extracted and saved. Ownership of the
  127. * UText struct itself remains with the caller. This is to match the behavior of
  128. * uregex_open().
  129. *
  130. * @param pattern The Regular Expression pattern to be compiled.
  131. * @param flags Flags that alter the default matching behavior for
  132. * the regular expression, UREGEX_CASE_INSENSITIVE, for
  133. * example. For default behavior, set this parameter to zero.
  134. * See <code>enum URegexpFlag</code>. All desired flags
  135. * are bitwise-ORed together.
  136. * @param pe Receives the position (line and column numbers) of any syntax
  137. * error within the source regular expression string. If this
  138. * information is not wanted, pass NULL for this parameter.
  139. * @param status Receives errors detected by this function.
  140. * @return The URegularExpression object representing the compiled pattern.
  141. * @stable ICU 4.6
  142. */
  143. U_STABLE URegularExpression * U_EXPORT2
  144. uregex_openUText(UText *pattern,
  145. uint32_t flags,
  146. UParseError *pe,
  147. UErrorCode *status);
  148. /**
  149. * Open (compile) an ICU regular expression. The resulting regular expression
  150. * handle can then be used to perform various matching operations.
  151. * <p>
  152. * This function is the same as uregex_open, except that the pattern
  153. * is supplied as an 8 bit char * string in the default code page.
  154. *
  155. * @param pattern The Regular Expression pattern to be compiled,
  156. * NUL terminated.
  157. * @param flags Flags that alter the default matching behavior for
  158. * the regular expression, UREGEX_CASE_INSENSITIVE, for
  159. * example. For default behavior, set this parameter to zero.
  160. * See <code>enum URegexpFlag</code>. All desired flags
  161. * are bitwise-ORed together.
  162. * @param pe Receives the position (line and column numbers) of any syntax
  163. * error within the source regular expression string. If this
  164. * information is not wanted, pass NULL for this parameter.
  165. * @param status Receives errors detected by this function.
  166. * @return The URegularExpression object representing the compiled
  167. * pattern.
  168. *
  169. * @stable ICU 3.0
  170. */
  171. #if !UCONFIG_NO_CONVERSION
  172. U_STABLE URegularExpression * U_EXPORT2
  173. uregex_openC( const char *pattern,
  174. uint32_t flags,
  175. UParseError *pe,
  176. UErrorCode *status);
  177. #endif
  178. /**
  179. * Close the regular expression, recovering all resources (memory) it
  180. * was holding.
  181. *
  182. * @param regexp The regular expression to be closed.
  183. * @stable ICU 3.0
  184. */
  185. U_STABLE void U_EXPORT2
  186. uregex_close(URegularExpression *regexp);
  187. #if U_SHOW_CPLUSPLUS_API
  188. U_NAMESPACE_BEGIN
  189. /**
  190. * \class LocalURegularExpressionPointer
  191. * "Smart pointer" class, closes a URegularExpression via uregex_close().
  192. * For most methods see the LocalPointerBase base class.
  193. *
  194. * @see LocalPointerBase
  195. * @see LocalPointer
  196. * @stable ICU 4.4
  197. */
  198. U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close);
  199. U_NAMESPACE_END
  200. #endif
  201. /**
  202. * Make a copy of a compiled regular expression. Cloning a regular
  203. * expression is faster than opening a second instance from the source
  204. * form of the expression, and requires less memory.
  205. * <p>
  206. * Note that the current input string and the position of any matched text
  207. * within it are not cloned; only the pattern itself and the
  208. * match mode flags are copied.
  209. * <p>
  210. * Cloning can be particularly useful to threaded applications that perform
  211. * multiple match operations in parallel. Each concurrent RE
  212. * operation requires its own instance of a URegularExpression.
  213. *
  214. * @param regexp The compiled regular expression to be cloned.
  215. * @param status Receives indication of any errors encountered
  216. * @return the cloned copy of the compiled regular expression.
  217. * @stable ICU 3.0
  218. */
  219. U_STABLE URegularExpression * U_EXPORT2
  220. uregex_clone(const URegularExpression *regexp, UErrorCode *status);
  221. /**
  222. * Returns a pointer to the source form of the pattern for this regular expression.
  223. * This function will work even if the pattern was originally specified as a UText.
  224. *
  225. * @param regexp The compiled regular expression.
  226. * @param patLength This output parameter will be set to the length of the
  227. * pattern string. A NULL pointer may be used here if the
  228. * pattern length is not needed, as would be the case if
  229. * the pattern is known in advance to be a NUL terminated
  230. * string.
  231. * @param status Receives errors detected by this function.
  232. * @return a pointer to the pattern string. The storage for the string is
  233. * owned by the regular expression object, and must not be
  234. * altered or deleted by the application. The returned string
  235. * will remain valid until the regular expression is closed.
  236. * @stable ICU 3.0
  237. */
  238. U_STABLE const UChar * U_EXPORT2
  239. uregex_pattern(const URegularExpression *regexp,
  240. int32_t *patLength,
  241. UErrorCode *status);
  242. /**
  243. * Returns the source text of the pattern for this regular expression.
  244. * This function will work even if the pattern was originally specified as a UChar string.
  245. *
  246. * @param regexp The compiled regular expression.
  247. * @param status Receives errors detected by this function.
  248. * @return the pattern text. The storage for the text is owned by the regular expression
  249. * object, and must not be altered or deleted.
  250. *
  251. * @stable ICU 4.6
  252. */
  253. U_STABLE UText * U_EXPORT2
  254. uregex_patternUText(const URegularExpression *regexp,
  255. UErrorCode *status);
  256. /**
  257. * Get the match mode flags that were specified when compiling this regular expression.
  258. * @param status Receives errors detected by this function.
  259. * @param regexp The compiled regular expression.
  260. * @return The match mode flags
  261. * @see URegexpFlag
  262. * @stable ICU 3.0
  263. */
  264. U_STABLE int32_t U_EXPORT2
  265. uregex_flags(const URegularExpression *regexp,
  266. UErrorCode *status);
  267. /**
  268. * Set the subject text string upon which the regular expression will look for matches.
  269. * This function may be called any number of times, allowing the regular
  270. * expression pattern to be applied to different strings.
  271. * <p>
  272. * Regular expression matching operations work directly on the application's
  273. * string data. No copy is made. The subject string data must not be
  274. * altered after calling this function until after all regular expression
  275. * operations involving this string data are completed.
  276. * <p>
  277. * Zero length strings are permitted. In this case, no subsequent match
  278. * operation will dereference the text string pointer.
  279. *
  280. * @param regexp The compiled regular expression.
  281. * @param text The subject text string.
  282. * @param textLength The length of the subject text, or -1 if the string
  283. * is NUL terminated.
  284. * @param status Receives errors detected by this function.
  285. * @stable ICU 3.0
  286. */
  287. U_STABLE void U_EXPORT2
  288. uregex_setText(URegularExpression *regexp,
  289. const UChar *text,
  290. int32_t textLength,
  291. UErrorCode *status);
  292. /**
  293. * Set the subject text string upon which the regular expression will look for matches.
  294. * This function may be called any number of times, allowing the regular
  295. * expression pattern to be applied to different strings.
  296. * <p>
  297. * Regular expression matching operations work directly on the application's
  298. * string data; only a shallow clone is made. The subject string data must not be
  299. * altered after calling this function until after all regular expression
  300. * operations involving this string data are completed.
  301. *
  302. * @param regexp The compiled regular expression.
  303. * @param text The subject text string.
  304. * @param status Receives errors detected by this function.
  305. *
  306. * @stable ICU 4.6
  307. */
  308. U_STABLE void U_EXPORT2
  309. uregex_setUText(URegularExpression *regexp,
  310. UText *text,
  311. UErrorCode *status);
  312. /**
  313. * Get the subject text that is currently associated with this
  314. * regular expression object. If the input was supplied using uregex_setText(),
  315. * that pointer will be returned. Otherwise, the characters in the input will
  316. * be extracted to a buffer and returned. In either case, ownership remains
  317. * with the regular expression object.
  318. *
  319. * This function will work even if the input was originally specified as a UText.
  320. *
  321. * @param regexp The compiled regular expression.
  322. * @param textLength The length of the string is returned in this output parameter.
  323. * A NULL pointer may be used here if the
  324. * text length is not needed, as would be the case if
  325. * the text is known in advance to be a NUL terminated
  326. * string.
  327. * @param status Receives errors detected by this function.
  328. * @return Pointer to the subject text string currently associated with
  329. * this regular expression.
  330. * @stable ICU 3.0
  331. */
  332. U_STABLE const UChar * U_EXPORT2
  333. uregex_getText(URegularExpression *regexp,
  334. int32_t *textLength,
  335. UErrorCode *status);
  336. /**
  337. * Get the subject text that is currently associated with this
  338. * regular expression object.
  339. *
  340. * This function will work even if the input was originally specified as a UChar string.
  341. *
  342. * @param regexp The compiled regular expression.
  343. * @param dest A mutable UText in which to store the current input.
  344. * If NULL, a new UText will be created as an immutable shallow clone
  345. * of the actual input string.
  346. * @param status Receives errors detected by this function.
  347. * @return The subject text currently associated with this regular expression.
  348. * If a pre-allocated UText was provided, it will always be used and returned.
  349. *
  350. * @stable ICU 4.6
  351. */
  352. U_STABLE UText * U_EXPORT2
  353. uregex_getUText(URegularExpression *regexp,
  354. UText *dest,
  355. UErrorCode *status);
  356. /**
  357. * Set the subject text string upon which the regular expression is looking for matches
  358. * without changing any other aspect of the matching state.
  359. * The new and previous text strings must have the same content.
  360. *
  361. * This function is intended for use in environments where ICU is operating on
  362. * strings that may move around in memory. It provides a mechanism for notifying
  363. * ICU that the string has been relocated, and providing a new UText to access the
  364. * string in its new position.
  365. *
  366. * Note that the regular expression implementation never copies the underlying text
  367. * of a string being matched, but always operates directly on the original text
  368. * provided by the user. Refreshing simply drops the references to the old text
  369. * and replaces them with references to the new.
  370. *
  371. * Caution: this function is normally used only by very specialized
  372. * system-level code. One example use case is with garbage collection
  373. * that moves the text in memory.
  374. *
  375. * @param regexp The compiled regular expression.
  376. * @param text The new (moved) text string.
  377. * @param status Receives errors detected by this function.
  378. *
  379. * @stable ICU 4.8
  380. */
  381. U_STABLE void U_EXPORT2
  382. uregex_refreshUText(URegularExpression *regexp,
  383. UText *text,
  384. UErrorCode *status);
  385. /**
  386. * Attempts to match the input string against the pattern.
  387. * To succeed, the match must extend to the end of the string,
  388. * or cover the complete match region.
  389. *
  390. * If startIndex >= zero the match operation starts at the specified
  391. * index and must extend to the end of the input string. Any region
  392. * that has been specified is reset.
  393. *
  394. * If startIndex == -1 the match must cover the input region, or the entire
  395. * input string if no region has been set. This directly corresponds to
  396. * Matcher.matches() in Java.
  397. *
  398. * @param regexp The compiled regular expression.
  399. * @param startIndex The input string (native) index at which to begin matching, or -1
  400. * to match the input Region.
  401. * @param status Receives errors detected by this function.
  402. * @return TRUE if there is a match
  403. * @stable ICU 3.0
  404. */
  405. U_STABLE UBool U_EXPORT2
  406. uregex_matches(URegularExpression *regexp,
  407. int32_t startIndex,
  408. UErrorCode *status);
  409. /**
  410. * 64bit version of uregex_matches.
  411. * Attempts to match the input string against the pattern.
  412. * To succeed, the match must extend to the end of the string,
  413. * or cover the complete match region.
  414. *
  415. * If startIndex >= zero the match operation starts at the specified
  416. * index and must extend to the end of the input string. Any region
  417. * that has been specified is reset.
  418. *
  419. * If startIndex == -1 the match must cover the input region, or the entire
  420. * input string if no region has been set. This directly corresponds to
  421. * Matcher.matches() in Java.
  422. *
  423. * @param regexp The compiled regular expression.
  424. * @param startIndex The input string (native) index at which to begin matching, or -1
  425. * to match the input Region.
  426. * @param status Receives errors detected by this function.
  427. * @return TRUE if there is a match
  428. * @stable ICU 4.6
  429. */
  430. U_STABLE UBool U_EXPORT2
  431. uregex_matches64(URegularExpression *regexp,
  432. int64_t startIndex,
  433. UErrorCode *status);
  434. /**
  435. * Attempts to match the input string, starting from the specified index, against the pattern.
  436. * The match may be of any length, and is not required to extend to the end
  437. * of the input string. Contrast with uregex_matches().
  438. *
  439. * <p>If startIndex is >= 0 any input region that was set for this
  440. * URegularExpression is reset before the operation begins.
  441. *
  442. * <p>If the specified starting index == -1 the match begins at the start of the input
  443. * region, or at the start of the full string if no region has been specified.
  444. * This corresponds directly with Matcher.lookingAt() in Java.
  445. *
  446. * <p>If the match succeeds then more information can be obtained via the
  447. * <code>uregex_start()</code>, <code>uregex_end()</code>,
  448. * and <code>uregex_group()</code> functions.</p>
  449. *
  450. * @param regexp The compiled regular expression.
  451. * @param startIndex The input string (native) index at which to begin matching, or
  452. * -1 to match the Input Region
  453. * @param status A reference to a UErrorCode to receive any errors.
  454. * @return TRUE if there is a match.
  455. * @stable ICU 3.0
  456. */
  457. U_STABLE UBool U_EXPORT2
  458. uregex_lookingAt(URegularExpression *regexp,
  459. int32_t startIndex,
  460. UErrorCode *status);
  461. /**
  462. * 64bit version of uregex_lookingAt.
  463. * Attempts to match the input string, starting from the specified index, against the pattern.
  464. * The match may be of any length, and is not required to extend to the end
  465. * of the input string. Contrast with uregex_matches().
  466. *
  467. * <p>If startIndex is >= 0 any input region that was set for this
  468. * URegularExpression is reset before the operation begins.
  469. *
  470. * <p>If the specified starting index == -1 the match begins at the start of the input
  471. * region, or at the start of the full string if no region has been specified.
  472. * This corresponds directly with Matcher.lookingAt() in Java.
  473. *
  474. * <p>If the match succeeds then more information can be obtained via the
  475. * <code>uregex_start()</code>, <code>uregex_end()</code>,
  476. * and <code>uregex_group()</code> functions.</p>
  477. *
  478. * @param regexp The compiled regular expression.
  479. * @param startIndex The input string (native) index at which to begin matching, or
  480. * -1 to match the Input Region
  481. * @param status A reference to a UErrorCode to receive any errors.
  482. * @return TRUE if there is a match.
  483. * @stable ICU 4.6
  484. */
  485. U_STABLE UBool U_EXPORT2
  486. uregex_lookingAt64(URegularExpression *regexp,
  487. int64_t startIndex,
  488. UErrorCode *status);
  489. /**
  490. * Find the first matching substring of the input string that matches the pattern.
  491. * If startIndex is >= zero the search for a match begins at the specified index,
  492. * and any match region is reset. This corresponds directly with
  493. * Matcher.find(startIndex) in Java.
  494. *
  495. * If startIndex == -1 the search begins at the start of the input region,
  496. * or at the start of the full string if no region has been specified.
  497. *
  498. * If a match is found, <code>uregex_start(), uregex_end()</code>, and
  499. * <code>uregex_group()</code> will provide more information regarding the match.
  500. *
  501. * @param regexp The compiled regular expression.
  502. * @param startIndex The position (native) in the input string to begin the search, or
  503. * -1 to search within the Input Region.
  504. * @param status A reference to a UErrorCode to receive any errors.
  505. * @return TRUE if a match is found.
  506. * @stable ICU 3.0
  507. */
  508. U_STABLE UBool U_EXPORT2
  509. uregex_find(URegularExpression *regexp,
  510. int32_t startIndex,
  511. UErrorCode *status);
  512. /**
  513. * 64bit version of uregex_find.
  514. * Find the first matching substring of the input string that matches the pattern.
  515. * If startIndex is >= zero the search for a match begins at the specified index,
  516. * and any match region is reset. This corresponds directly with
  517. * Matcher.find(startIndex) in Java.
  518. *
  519. * If startIndex == -1 the search begins at the start of the input region,
  520. * or at the start of the full string if no region has been specified.
  521. *
  522. * If a match is found, <code>uregex_start(), uregex_end()</code>, and
  523. * <code>uregex_group()</code> will provide more information regarding the match.
  524. *
  525. * @param regexp The compiled regular expression.
  526. * @param startIndex The position (native) in the input string to begin the search, or
  527. * -1 to search within the Input Region.
  528. * @param status A reference to a UErrorCode to receive any errors.
  529. * @return TRUE if a match is found.
  530. * @stable ICU 4.6
  531. */
  532. U_STABLE UBool U_EXPORT2
  533. uregex_find64(URegularExpression *regexp,
  534. int64_t startIndex,
  535. UErrorCode *status);
  536. /**
  537. * Find the next pattern match in the input string. Begin searching
  538. * the input at the location following the end of the previous match,
  539. * or at the start of the string (or region) if there is no
  540. * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and
  541. * <code>uregex_group()</code> will provide more information regarding the match.
  542. *
  543. * @param regexp The compiled regular expression.
  544. * @param status A reference to a UErrorCode to receive any errors.
  545. * @return TRUE if a match is found.
  546. * @see uregex_reset
  547. * @stable ICU 3.0
  548. */
  549. U_STABLE UBool U_EXPORT2
  550. uregex_findNext(URegularExpression *regexp,
  551. UErrorCode *status);
  552. /**
  553. * Get the number of capturing groups in this regular expression's pattern.
  554. * @param regexp The compiled regular expression.
  555. * @param status A reference to a UErrorCode to receive any errors.
  556. * @return the number of capture groups
  557. * @stable ICU 3.0
  558. */
  559. U_STABLE int32_t U_EXPORT2
  560. uregex_groupCount(URegularExpression *regexp,
  561. UErrorCode *status);
  562. #ifndef U_HIDE_DRAFT_API
  563. /**
  564. * Get the group number corresponding to a named capture group.
  565. * The returned number can be used with any function that accesses
  566. * capture groups by number.
  567. *
  568. * The function returns an error status if the specified name does not
  569. * appear in the pattern.
  570. *
  571. * @param regexp The compiled regular expression.
  572. * @param groupName The capture group name.
  573. * @param nameLength The length of the name, or -1 if the name is a
  574. * nul-terminated string.
  575. * @param status A pointer to a UErrorCode to receive any errors.
  576. * @return the group number corresponding to the named capture group.
  577. * @draft ICU 55
  578. */
  579. U_DRAFT int32_t U_EXPORT2
  580. uregex_groupNumberFromName(URegularExpression *regexp,
  581. const UChar *groupName,
  582. int32_t nameLength,
  583. UErrorCode *status);
  584. /**
  585. * Get the group number corresponding to a named capture group.
  586. * The returned number can be used with any function that accesses
  587. * capture groups by number.
  588. *
  589. * The function returns an error status if the specified name does not
  590. * appear in the pattern.
  591. *
  592. * @param regexp The compiled regular expression.
  593. * @param groupName The capture group name,
  594. * platform invariant characters only.
  595. * @param nameLength The length of the name, or -1 if the name is
  596. * nul-terminated.
  597. * @param status A pointer to a UErrorCode to receive any errors.
  598. *
  599. * @draft ICU 55
  600. */
  601. U_DRAFT int32_t U_EXPORT2
  602. uregex_groupNumberFromCName(URegularExpression *regexp,
  603. const char *groupName,
  604. int32_t nameLength,
  605. UErrorCode *status);
  606. #endif /* U_HIDE_DRAFT_API */
  607. /** Extract the string for the specified matching expression or subexpression.
  608. * Group #0 is the complete string of matched text.
  609. * Group #1 is the text matched by the first set of capturing parentheses.
  610. *
  611. * @param regexp The compiled regular expression.
  612. * @param groupNum The capture group to extract. Group 0 is the complete
  613. * match. The value of this parameter must be
  614. * less than or equal to the number of capture groups in
  615. * the pattern.
  616. * @param dest Buffer to receive the matching string data
  617. * @param destCapacity Capacity of the dest buffer.
  618. * @param status A reference to a UErrorCode to receive any errors.
  619. * @return Length of matching data,
  620. * or -1 if no applicable match.
  621. * @stable ICU 3.0
  622. */
  623. U_STABLE int32_t U_EXPORT2
  624. uregex_group(URegularExpression *regexp,
  625. int32_t groupNum,
  626. UChar *dest,
  627. int32_t destCapacity,
  628. UErrorCode *status);
  629. /** Returns a shallow immutable clone of the entire input string with the current index set
  630. * to the beginning of the requested capture group. The capture group length is also
  631. * returned via groupLength.
  632. * Group #0 is the complete string of matched text.
  633. * Group #1 is the text matched by the first set of capturing parentheses.
  634. *
  635. * @param regexp The compiled regular expression.
  636. * @param groupNum The capture group to extract. Group 0 is the complete
  637. * match. The value of this parameter must be
  638. * less than or equal to the number of capture groups in
  639. * the pattern.
  640. * @param dest A mutable UText in which to store the current input.
  641. * If NULL, a new UText will be created as an immutable shallow clone
  642. * of the entire input string.
  643. * @param groupLength The group length of the desired capture group. Output parameter.
  644. * @param status A reference to a UErrorCode to receive any errors.
  645. * @return The subject text currently associated with this regular expression.
  646. * If a pre-allocated UText was provided, it will always be used and returned.
  647. *
  648. * @stable ICU 4.6
  649. */
  650. U_STABLE UText * U_EXPORT2
  651. uregex_groupUText(URegularExpression *regexp,
  652. int32_t groupNum,
  653. UText *dest,
  654. int64_t *groupLength,
  655. UErrorCode *status);
  656. /**
  657. * Returns the index in the input string of the start of the text matched by the
  658. * specified capture group during the previous match operation. Return -1 if
  659. * the capture group was not part of the last match.
  660. * Group #0 refers to the complete range of matched text.
  661. * Group #1 refers to the text matched by the first set of capturing parentheses.
  662. *
  663. * @param regexp The compiled regular expression.
  664. * @param groupNum The capture group number
  665. * @param status A reference to a UErrorCode to receive any errors.
  666. * @return the starting (native) position in the input of the text matched
  667. * by the specified group.
  668. * @stable ICU 3.0
  669. */
  670. U_STABLE int32_t U_EXPORT2
  671. uregex_start(URegularExpression *regexp,
  672. int32_t groupNum,
  673. UErrorCode *status);
  674. /**
  675. * 64bit version of uregex_start.
  676. * Returns the index in the input string of the start of the text matched by the
  677. * specified capture group during the previous match operation. Return -1 if
  678. * the capture group was not part of the last match.
  679. * Group #0 refers to the complete range of matched text.
  680. * Group #1 refers to the text matched by the first set of capturing parentheses.
  681. *
  682. * @param regexp The compiled regular expression.
  683. * @param groupNum The capture group number
  684. * @param status A reference to a UErrorCode to receive any errors.
  685. * @return the starting (native) position in the input of the text matched
  686. * by the specified group.
  687. * @stable ICU 4.6
  688. */
  689. U_STABLE int64_t U_EXPORT2
  690. uregex_start64(URegularExpression *regexp,
  691. int32_t groupNum,
  692. UErrorCode *status);
  693. /**
  694. * Returns the index in the input string of the position following the end
  695. * of the text matched by the specified capture group.
  696. * Return -1 if the capture group was not part of the last match.
  697. * Group #0 refers to the complete range of matched text.
  698. * Group #1 refers to the text matched by the first set of capturing parentheses.
  699. *
  700. * @param regexp The compiled regular expression.
  701. * @param groupNum The capture group number
  702. * @param status A reference to a UErrorCode to receive any errors.
  703. * @return the (native) index of the position following the last matched character.
  704. * @stable ICU 3.0
  705. */
  706. U_STABLE int32_t U_EXPORT2
  707. uregex_end(URegularExpression *regexp,
  708. int32_t groupNum,
  709. UErrorCode *status);
  710. /**
  711. * 64bit version of uregex_end.
  712. * Returns the index in the input string of the position following the end
  713. * of the text matched by the specified capture group.
  714. * Return -1 if the capture group was not part of the last match.
  715. * Group #0 refers to the complete range of matched text.
  716. * Group #1 refers to the text matched by the first set of capturing parentheses.
  717. *
  718. * @param regexp The compiled regular expression.
  719. * @param groupNum The capture group number
  720. * @param status A reference to a UErrorCode to receive any errors.
  721. * @return the (native) index of the position following the last matched character.
  722. * @stable ICU 4.6
  723. */
  724. U_STABLE int64_t U_EXPORT2
  725. uregex_end64(URegularExpression *regexp,
  726. int32_t groupNum,
  727. UErrorCode *status);
  728. /**
  729. * Reset any saved state from the previous match. Has the effect of
  730. * causing uregex_findNext to begin at the specified index, and causing
  731. * uregex_start(), uregex_end() and uregex_group() to return an error
  732. * indicating that there is no match information available. Clears any
  733. * match region that may have been set.
  734. *
  735. * @param regexp The compiled regular expression.
  736. * @param index The position (native) in the text at which a
  737. * uregex_findNext() should begin searching.
  738. * @param status A reference to a UErrorCode to receive any errors.
  739. * @stable ICU 3.0
  740. */
  741. U_STABLE void U_EXPORT2
  742. uregex_reset(URegularExpression *regexp,
  743. int32_t index,
  744. UErrorCode *status);
  745. /**
  746. * 64bit version of uregex_reset.
  747. * Reset any saved state from the previous match. Has the effect of
  748. * causing uregex_findNext to begin at the specified index, and causing
  749. * uregex_start(), uregex_end() and uregex_group() to return an error
  750. * indicating that there is no match information available. Clears any
  751. * match region that may have been set.
  752. *
  753. * @param regexp The compiled regular expression.
  754. * @param index The position (native) in the text at which a
  755. * uregex_findNext() should begin searching.
  756. * @param status A reference to a UErrorCode to receive any errors.
  757. * @stable ICU 4.6
  758. */
  759. U_STABLE void U_EXPORT2
  760. uregex_reset64(URegularExpression *regexp,
  761. int64_t index,
  762. UErrorCode *status);
  763. /**
  764. * Sets the limits of the matching region for this URegularExpression.
  765. * The region is the part of the input string that will be considered when matching.
  766. * Invoking this method resets any saved state from the previous match,
  767. * then sets the region to start at the index specified by the start parameter
  768. * and end at the index specified by the end parameter.
  769. *
  770. * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds
  771. * and uregex_useAnchoringBounds), certain constructs such as anchors may behave differently
  772. * at or around the boundaries of the region.
  773. *
  774. * The function will fail if regionStart is greater than regionLimit, or if either index
  775. * is less than zero or greater than the length of the string being matched.
  776. *
  777. * @param regexp The compiled regular expression.
  778. * @param regionStart The (native) index to begin searches at.
  779. * @param regionLimit The (native) index to end searches at (exclusive).
  780. * @param status A pointer to a UErrorCode to receive any errors.
  781. * @stable ICU 4.0
  782. */
  783. U_STABLE void U_EXPORT2
  784. uregex_setRegion(URegularExpression *regexp,
  785. int32_t regionStart,
  786. int32_t regionLimit,
  787. UErrorCode *status);
  788. /**
  789. * 64bit version of uregex_setRegion.
  790. * Sets the limits of the matching region for this URegularExpression.
  791. * The region is the part of the input string that will be considered when matching.
  792. * Invoking this method resets any saved state from the previous match,
  793. * then sets the region to start at the index specified by the start parameter
  794. * and end at the index specified by the end parameter.
  795. *
  796. * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds
  797. * and uregex_useAnchoringBounds), certain constructs such as anchors may behave differently
  798. * at or around the boundaries of the region.
  799. *
  800. * The function will fail if regionStart is greater than regionLimit, or if either index
  801. * is less than zero or greater than the length of the string being matched.
  802. *
  803. * @param regexp The compiled regular expression.
  804. * @param regionStart The (native) index to begin searches at.
  805. * @param regionLimit The (native) index to end searches at (exclusive).
  806. * @param status A pointer to a UErrorCode to receive any errors.
  807. * @stable ICU 4.6
  808. */
  809. U_STABLE void U_EXPORT2
  810. uregex_setRegion64(URegularExpression *regexp,
  811. int64_t regionStart,
  812. int64_t regionLimit,
  813. UErrorCode *status);
  814. /**
  815. * Set the matching region and the starting index for subsequent matches
  816. * in a single operation.
  817. * This is useful because the usual function for setting the starting
  818. * index, uregex_reset(), also resets any region limits.
  819. *
  820. * @param regexp The compiled regular expression.
  821. * @param regionStart The (native) index to begin searches at.
  822. * @param regionLimit The (native) index to end searches at (exclusive).
  823. * @param startIndex The index in the input text at which the next
  824. * match operation should begin.
  825. * @param status A pointer to a UErrorCode to receive any errors.
  826. * @stable ICU 4.6
  827. */
  828. U_STABLE void U_EXPORT2
  829. uregex_setRegionAndStart(URegularExpression *regexp,
  830. int64_t regionStart,
  831. int64_t regionLimit,
  832. int64_t startIndex,
  833. UErrorCode *status);
  834. /**
  835. * Reports the start index of the matching region. Any matches found are limited to
  836. * the region bounded by regionStart (inclusive) and regionEnd (exclusive).
  837. *
  838. * @param regexp The compiled regular expression.
  839. * @param status A pointer to a UErrorCode to receive any errors.
  840. * @return The starting (native) index of this matcher's region.
  841. * @stable ICU 4.0
  842. */
  843. U_STABLE int32_t U_EXPORT2
  844. uregex_regionStart(const URegularExpression *regexp,
  845. UErrorCode *status);
  846. /**
  847. * 64bit version of uregex_regionStart.
  848. * Reports the start index of the matching region. Any matches found are limited to
  849. * the region bounded by regionStart (inclusive) and regionEnd (exclusive).
  850. *
  851. * @param regexp The compiled regular expression.
  852. * @param status A pointer to a UErrorCode to receive any errors.
  853. * @return The starting (native) index of this matcher's region.
  854. * @stable ICU 4.6
  855. */
  856. U_STABLE int64_t U_EXPORT2
  857. uregex_regionStart64(const URegularExpression *regexp,
  858. UErrorCode *status);
  859. /**
  860. * Reports the end index (exclusive) of the matching region for this URegularExpression.
  861. * Any matches found are limited to the region bounded by regionStart (inclusive)
  862. * and regionEnd (exclusive).
  863. *
  864. * @param regexp The compiled regular expression.
  865. * @param status A pointer to a UErrorCode to receive any errors.
  866. * @return The ending point (native) of this matcher's region.
  867. * @stable ICU 4.0
  868. */
  869. U_STABLE int32_t U_EXPORT2
  870. uregex_regionEnd(const URegularExpression *regexp,
  871. UErrorCode *status);
  872. /**
  873. * 64bit version of uregex_regionEnd.
  874. * Reports the end index (exclusive) of the matching region for this URegularExpression.
  875. * Any matches found are limited to the region bounded by regionStart (inclusive)
  876. * and regionEnd (exclusive).
  877. *
  878. * @param regexp The compiled regular expression.
  879. * @param status A pointer to a UErrorCode to receive any errors.
  880. * @return The ending point (native) of this matcher's region.
  881. * @stable ICU 4.6
  882. */
  883. U_STABLE int64_t U_EXPORT2
  884. uregex_regionEnd64(const URegularExpression *regexp,
  885. UErrorCode *status);
  886. /**
  887. * Queries the transparency of region bounds for this URegularExpression.
  888. * See uregex_useTransparentBounds for a description of transparent and opaque bounds.
  889. * By default, matching boundaries are opaque.
  890. *
  891. * @param regexp The compiled regular expression.
  892. * @param status A pointer to a UErrorCode to receive any errors.
  893. * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
  894. * @stable ICU 4.0
  895. */
  896. U_STABLE UBool U_EXPORT2
  897. uregex_hasTransparentBounds(const URegularExpression *regexp,
  898. UErrorCode *status);
  899. /**
  900. * Sets the transparency of region bounds for this URegularExpression.
  901. * Invoking this function with an argument of TRUE will set matches to use transparent bounds.
  902. * If the boolean argument is FALSE, then opaque bounds will be used.
  903. *
  904. * Using transparent bounds, the boundaries of the matching region are transparent
  905. * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
  906. * see text beyond the boundaries of the region while checking for a match.
  907. *
  908. * With opaque bounds, no text outside of the matching region is visible to lookahead,
  909. * lookbehind, and boundary matching constructs.
  910. *
  911. * By default, opaque bounds are used.
  912. *
  913. * @param regexp The compiled regular expression.
  914. * @param b TRUE for transparent bounds; FALSE for opaque bounds
  915. * @param status A pointer to a UErrorCode to receive any errors.
  916. * @stable ICU 4.0
  917. **/
  918. U_STABLE void U_EXPORT2
  919. uregex_useTransparentBounds(URegularExpression *regexp,
  920. UBool b,
  921. UErrorCode *status);
  922. /**
  923. * Return true if this URegularExpression is using anchoring bounds.
  924. * By default, anchoring region bounds are used.
  925. *
  926. * @param regexp The compiled regular expression.
  927. * @param status A pointer to a UErrorCode to receive any errors.
  928. * @return TRUE if this matcher is using anchoring bounds.
  929. * @stable ICU 4.0
  930. */
  931. U_STABLE UBool U_EXPORT2
  932. uregex_hasAnchoringBounds(const URegularExpression *regexp,
  933. UErrorCode *status);
  934. /**
  935. * Set whether this URegularExpression is using Anchoring Bounds for its region.
  936. * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
  937. * and end of the region. Without Anchoring Bounds, anchors will only match at
  938. * the positions they would in the complete text.
  939. *
  940. * Anchoring Bounds are the default for regions.
  941. *
  942. * @param regexp The compiled regular expression.
  943. * @param b TRUE to enable anchoring bounds; FALSE to disable them.
  944. * @param status A pointer to a UErrorCode to receive any errors.
  945. * @stable ICU 4.0
  946. */
  947. U_STABLE void U_EXPORT2
  948. uregex_useAnchoringBounds(URegularExpression *regexp,
  949. UBool b,
  950. UErrorCode *status);
  951. /**
  952. * Return TRUE if the most recent matching operation touched the
  953. * end of the text being processed. In this case, additional input text could
  954. * change the results of that match.
  955. *
  956. * @param regexp The compiled regular expression.
  957. * @param status A pointer to a UErrorCode to receive any errors.
  958. * @return TRUE if the most recent match hit the end of input
  959. * @stable ICU 4.0
  960. */
  961. U_STABLE UBool U_EXPORT2
  962. uregex_hitEnd(const URegularExpression *regexp,
  963. UErrorCode *status);
  964. /**
  965. * Return TRUE if the most recent match succeeded and additional input could cause
  966. * it to fail. If this function returns false and a match was found, then more input
  967. * might change the match but the match won't be lost. If a match was not found,
  968. * then requireEnd has no meaning.
  969. *
  970. * @param regexp The compiled regular expression.
  971. * @param status A pointer to a UErrorCode to receive any errors.
  972. * @return TRUE if more input could cause the most recent match to no longer match.
  973. * @stable ICU 4.0
  974. */
  975. U_STABLE UBool U_EXPORT2
  976. uregex_requireEnd(const URegularExpression *regexp,
  977. UErrorCode *status);
  978. /**
  979. * Replaces every substring of the input that matches the pattern
  980. * with the given replacement string. This is a convenience function that
  981. * provides a complete find-and-replace-all operation.
  982. *
  983. * This method scans the input string looking for matches of the pattern.
  984. * Input that is not part of any match is copied unchanged to the
  985. * destination buffer. Matched regions are replaced in the output
  986. * buffer by the replacement string. The replacement string may contain
  987. * references to capture groups; these take the form of $1, $2, etc.
  988. *
  989. * @param regexp The compiled regular expression.
  990. * @param replacementText A string containing the replacement text.
  991. * @param replacementLength The length of the replacement string, or
  992. * -1 if it is NUL terminated.
  993. * @param destBuf A (UChar *) buffer that will receive the result.
  994. * @param destCapacity The capacity of the destination buffer.
  995. * @param status A reference to a UErrorCode to receive any errors.
  996. * @return The length of the string resulting from the find
  997. * and replace operation. In the event that the
  998. * destination capacity is inadequate, the return value
  999. * is still the full length of the untruncated string.
  1000. * @stable ICU 3.0
  1001. */
  1002. U_STABLE int32_t U_EXPORT2
  1003. uregex_replaceAll(URegularExpression *regexp,
  1004. const UChar *replacementText,
  1005. int32_t replacementLength,
  1006. UChar *destBuf,
  1007. int32_t destCapacity,
  1008. UErrorCode *status);
  1009. /**
  1010. * Replaces every substring of the input that matches the pattern
  1011. * with the given replacement string. This is a convenience function that
  1012. * provides a complete find-and-replace-all operation.
  1013. *
  1014. * This method scans the input string looking for matches of the pattern.
  1015. * Input that is not part of any match is copied unchanged to the
  1016. * destination buffer. Matched regions are replaced in the output
  1017. * buffer by the replacement string. The replacement string may contain
  1018. * references to capture groups; these take the form of $1, $2, etc.
  1019. *
  1020. * @param regexp The compiled regular expression.
  1021. * @param replacement A string containing the replacement text.
  1022. * @param dest A mutable UText that will receive the result.
  1023. * If NULL, a new UText will be created (which may not be mutable).
  1024. * @param status A reference to a UErrorCode to receive any errors.
  1025. * @return A UText containing the results of the find and replace.
  1026. * If a pre-allocated UText was provided, it will always be used and returned.
  1027. *
  1028. * @stable ICU 4.6
  1029. */
  1030. U_STABLE UText * U_EXPORT2
  1031. uregex_replaceAllUText(URegularExpression *regexp,
  1032. UText *replacement,
  1033. UText *dest,
  1034. UErrorCode *status);
  1035. /**
  1036. * Replaces the first substring of the input that matches the pattern
  1037. * with the given replacement string. This is a convenience function that
  1038. * provides a complete find-and-replace operation.
  1039. *
  1040. * This method scans the input string looking for a match of the pattern.
  1041. * All input that is not part of the match is copied unchanged to the
  1042. * destination buffer. The matched region is replaced in the output
  1043. * buffer by the replacement string. The replacement string may contain
  1044. * references to capture groups; these take the form of $1, $2, etc.
  1045. *
  1046. * @param regexp The compiled regular expression.
  1047. * @param replacementText A string containing the replacement text.
  1048. * @param replacementLength The length of the replacement string, or
  1049. * -1 if it is NUL terminated.
  1050. * @param destBuf A (UChar *) buffer that will receive the result.
  1051. * @param destCapacity The capacity of the destination buffer.
  1052. * @param status A reference to a UErrorCode to receive any errors.
  1053. * @return The length of the string resulting from the find
  1054. * and replace operation. In the event that the
  1055. * destination capacity is inadequate, the return value
  1056. * is still the full length of the untruncated string.
  1057. * @stable ICU 3.0
  1058. */
  1059. U_STABLE int32_t U_EXPORT2
  1060. uregex_replaceFirst(URegularExpression *regexp,
  1061. const UChar *replacementText,
  1062. int32_t replacementLength,
  1063. UChar *destBuf,
  1064. int32_t destCapacity,
  1065. UErrorCode *status);
  1066. /**
  1067. * Replaces the first substring of the input that matches the pattern
  1068. * with the given replacement string. This is a convenience function that
  1069. * provides a complete find-and-replace operation.
  1070. *
  1071. * This method scans the input string looking for a match of the pattern.
  1072. * All input that is not part of the match is copied unchanged to the
  1073. * destination buffer. The matched region is replaced in the output
  1074. * buffer by the replacement string. The replacement string may contain
  1075. * references to capture groups; these take the form of $1, $2, etc.
  1076. *
  1077. * @param regexp The compiled regular expression.
  1078. * @param replacement A string containing the replacement text.
  1079. * @param dest A mutable UText that will receive the result.
  1080. * If NULL, a new UText will be created (which may not be mutable).
  1081. * @param status A reference to a UErrorCode to receive any errors.
  1082. * @return A UText containing the results of the find and replace.
  1083. * If a pre-allocated UText was provided, it will always be used and returned.
  1084. *
  1085. * @stable ICU 4.6
  1086. */
  1087. U_STABLE UText * U_EXPORT2
  1088. uregex_replaceFirstUText(URegularExpression *regexp,
  1089. UText *replacement,
  1090. UText *dest,
  1091. UErrorCode *status);
  1092. /**
  1093. * Implements a replace operation intended to be used as part of an
  1094. * incremental find-and-replace.
  1095. *
  1096. * <p>The input string, starting from the end of the previous match and ending at
  1097. * the start of the current match, is appended to the destination string. Then the
  1098. * replacement string is appended to the output string,
  1099. * including handling any substitutions of captured text.</p>
  1100. *
  1101. * <p>A note on preflight computation of buffer size and error handling:
  1102. * Calls to uregex_appendReplacement() and uregex_appendTail() are
  1103. * designed to be chained, one after another, with the destination
  1104. * buffer pointer and buffer capacity updated after each in preparation
  1105. * for the next. If the destination buffer is exhausted partway through such a
  1106. * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal
  1107. * ICU conventions are for a function to perform no action if it is
  1108. * called with an error status, but for this one case, uregex_appendReplacement()
  1109. * will operate normally so that buffer size computations will complete
  1110. * correctly.</p>
  1111. *
  1112. * <p>For simple, prepackaged, non-incremental find-and-replace
  1113. * operations, see uregex_replaceFirst() or uregex_replaceAll().</p>
  1114. *
  1115. * @param regexp The regular expression object.
  1116. * @param replacementText The string that will replace the matched portion of the
  1117. * input string as it is copied to the destination buffer.
  1118. * The replacement text may contain references ($1, for
  1119. * example) to capture groups from the match.
  1120. * @param replacementLength The length of the replacement text string,
  1121. * or -1 if the string is NUL terminated.
  1122. * @param destBuf The buffer into which the results of the
  1123. * find-and-replace are placed. On return, this pointer
  1124. * will be updated to refer to the beginning of the
  1125. * unused portion of buffer, leaving it in position for
  1126. * a subsequent call to this function.
  1127. * @param destCapacity The size of the output buffer. On return, this
  1128. * parameter will be updated to reflect the space remaining
  1129. * unused in the output buffer.
  1130. * @param status A reference to a UErrorCode to receive any errors.
  1131. * @return The length of the result string. In the event that
  1132. * destCapacity is inadequate, the full length of the
  1133. * untruncated output string is returned.
  1134. *
  1135. * @stable ICU 3.0
  1136. *
  1137. */
  1138. U_STABLE int32_t U_EXPORT2
  1139. uregex_appendReplacement(URegularExpression *regexp,
  1140. const UChar *replacementText,
  1141. int32_t replacementLength,
  1142. UChar **destBuf,
  1143. int32_t *destCapacity,
  1144. UErrorCode *status);
  1145. /**
  1146. * Implements a replace operation intended to be used as part of an
  1147. * incremental find-and-replace.
  1148. *
  1149. * <p>The input string, starting from the end of the previous match and ending at
  1150. * the start of the current match, is appended to the destination string. Then the
  1151. * replacement string is appended to the output string,
  1152. * including handling any substitutions of captured text.</p>
  1153. *
  1154. * <p>For simple, prepackaged, non-incremental find-and-replace
  1155. * operations, see uregex_replaceFirst() or uregex_replaceAll().</p>
  1156. *
  1157. * @param regexp The regular expression object.
  1158. * @param replacementText The string that will replace the matched portion of the
  1159. * input string as it is copied to the destination buffer.
  1160. * The replacement text may contain references ($1, for
  1161. * example) to capture groups from the match.
  1162. * @param dest A mutable UText that will receive the result. Must not be NULL.
  1163. * @param status A reference to a UErrorCode to receive any errors.
  1164. *
  1165. * @stable ICU 4.6
  1166. */
  1167. U_STABLE void U_EXPORT2
  1168. uregex_appendReplacementUText(URegularExpression *regexp,
  1169. UText *replacementText,
  1170. UText *dest,
  1171. UErrorCode *status);
  1172. /**
  1173. * As the final step in a find-and-replace operation, append the remainder
  1174. * of the input string, starting at the position following the last match,
  1175. * to the destination string. <code>uregex_appendTail()</code> is intended
  1176. * to be invoked after one or more invocations of the
  1177. * <code>uregex_appendReplacement()</code> function.
  1178. *
  1179. * @param regexp The regular expression object. This is needed to
  1180. * obtain the input string and the position
  1181. * of the last match within it.
  1182. * @param destBuf The buffer in which the results of the
  1183. * find-and-replace are placed. On return, the pointer
  1184. * will be updated to refer to the beginning of the
  1185. * unused portion of buffer.
  1186. * @param destCapacity The size of the output buffer. On return, this
  1187. * value will be updated to reflect the space remaining
  1188. * unused in the output buffer.
  1189. * @param status A reference to a UErrorCode to receive any errors.
  1190. * @return The length of the result string. In the event that
  1191. * destCapacity is inadequate, the full length of the
  1192. * untruncated output string is returned.
  1193. *
  1194. * @stable ICU 3.0
  1195. */
  1196. U_STABLE int32_t U_EXPORT2
  1197. uregex_appendTail(URegularExpression *regexp,
  1198. UChar **destBuf,
  1199. int32_t *destCapacity,
  1200. UErrorCode *status);
  1201. /**
  1202. * As the final step in a find-and-replace operation, append the remainder
  1203. * of the input string, starting at the position following the last match,
  1204. * to the destination string. <code>uregex_appendTailUText()</code> is intended
  1205. * to be invoked after one or more invocations of the
  1206. * <code>uregex_appendReplacementUText()</code> function.
  1207. *
  1208. * @param regexp The regular expression object. This is needed to
  1209. * obtain the input string and the position
  1210. * of the last match within it.
  1211. * @param dest A mutable UText that will receive the result. Must not be NULL.
  1212. *
  1213. * @param status Error code
  1214. *
  1215. * @return The destination UText.
  1216. *
  1217. * @stable ICU 4.6
  1218. */
  1219. U_STABLE UText * U_EXPORT2
  1220. uregex_appendTailUText(URegularExpression *regexp,
  1221. UText *dest,
  1222. UErrorCode *status);
  1223. /**
  1224. * Split a string into fields. Somewhat like split() from Perl.
  1225. * The pattern matches identify delimiters that separate the input
  1226. * into fields. The input data between the matches becomes the
  1227. * fields themselves.
  1228. *
  1229. * Each of the fields is copied from the input string to the destination
  1230. * buffer, and NUL terminated. The position of each field within
  1231. * the destination buffer is returned in the destFields array.
  1232. *
  1233. * If the delimiter pattern includes capture groups, the captured text will
  1234. * also appear in the destination array of output strings, interspersed
  1235. * with the fields. This is similar to Perl, but differs from Java,
  1236. * which ignores the presence of capture groups in the pattern.
  1237. *
  1238. * Trailing empty fields will always be returned, assuming sufficient
  1239. * destination capacity. This differs from the default behavior for Java
  1240. * and Perl where trailing empty fields are not returned.
  1241. *
  1242. * The number of strings produced by the split operation is returned.
  1243. * This count includes the strings from capture groups in the delimiter pattern.
  1244. * This behavior differs from Java, which ignores capture groups.
  1245. *
  1246. * @param regexp The compiled regular expression.
  1247. * @param destBuf A (UChar *) buffer to receive the fields that
  1248. * are extracted from the input string. The pointers
  1249. * stored in destFields will refer to positions within
  1250. * this caller-supplied buffer. Any extra positions
  1251. * within the destFields array will be
  1252. * set to NULL.
  1253. * @param destCapacity The capacity of the destBuf.
  1254. * @param requiredCapacity The actual capacity required of the destBuf.
  1255. * If destCapacity is too small, requiredCapacity will return
  1256. * the total capacity required to hold all of the output, and
  1257. * a U_BUFFER_OVERFLOW_ERROR will be returned.
  1258. * @param destFields An array to be filled with the position of each
  1259. * of the extracted fields within destBuf.
  1260. * @param destFieldsCapacity The number of elements in the destFields array.
  1261. * If the number of fields found is less than destFieldsCapacity,
  1262. * the extra destFields elements are set to NULL.
  1263. * If destFieldsCapacity is too small, the trailing part of the
  1264. * input, including any field delimiters, is treated as if it
  1265. * were the last field - it is copied to the destBuf, and
  1266. * its position in the destBuf is stored in the last element
  1267. * of destFields. This behavior mimics that of Perl. It is not
  1268. * an error condition, and no error status is returned when all destFields
  1269. * positions are used.
  1270. * @param status A reference to a UErrorCode to receive any errors.
  1271. * @return The number of fields into which the input string was split.
  1272. * @stable ICU 3.0
  1273. */
  1274. U_STABLE int32_t U_EXPORT2
  1275. uregex_split( URegularExpression *regexp,
  1276. UChar *destBuf,
  1277. int32_t destCapacity,
  1278. int32_t *requiredCapacity,
  1279. UChar *destFields[],
  1280. int32_t destFieldsCapacity,
  1281. UErrorCode *status);
  1282. /**
  1283. * Split a string into fields. Somewhat like split() from Perl.
  1284. * The pattern matches identify delimiters that separate the input
  1285. * into fields. The input data between the matches becomes the
  1286. * fields themselves.
  1287. * <p>
  1288. * The behavior of this function is not very closely aligned with uregex_split();
  1289. * instead, it is based on (and implemented directly on top of) the C++ split method.
  1290. *
  1291. * @param regexp The compiled regular expression.
  1292. * @param destFields An array of mutable UText structs to receive the results of the split.
  1293. * If a field is NULL, a new UText is allocated to contain the results for
  1294. * that field. This new UText is not guaranteed to be mutable.
  1295. * @param destFieldsCapacity The number of elements in the destination array.
  1296. * If the number of fields found is less than destFieldsCapacity, the
  1297. * extra strings in the destination array are not altered.
  1298. * If the number of destination strings is less than the number
  1299. * of fields, the trailing part of the input string, including any
  1300. * field delimiters, is placed in the last destination string.
  1301. * This behavior mimics that of Perl. It is not an error condition, and no
  1302. * error status is returned when all destFields positions are used.
  1303. * @param status A reference to a UErrorCode to receive any errors.
  1304. * @return The number of fields into which the input string was split.
  1305. *
  1306. * @stable ICU 4.6
  1307. */
  1308. U_STABLE int32_t U_EXPORT2
  1309. uregex_splitUText(URegularExpression *regexp,
  1310. UText *destFields[],
  1311. int32_t destFieldsCapacity,
  1312. UErrorCode *status);
  1313. /**
  1314. * Set a processing time limit for match operations with this URegularExpression.
  1315. *
  1316. * Some patterns, when matching certain strings, can run in exponential time.
  1317. * For practical purposes, the match operation may appear to be in an
  1318. * infinite loop.
  1319. * When a limit is set, a match operation will fail with an error if the
  1320. * limit is exceeded.
  1321. * <p>
  1322. * The units of the limit are steps of the match engine.
  1323. * Correspondence with actual processor time will depend on the speed
  1324. * of the processor and the details of the specific pattern, but will
  1325. * typically be on the order of milliseconds.
  1326. * <p>
  1327. * By default, the matching time is not limited.
  1328. *
  1329. *
  1330. * @param regexp The compiled regular expression.
  1331. * @param limit The limit value, or 0 for no limit.
  1332. * @param status A reference to a UErrorCode to receive any errors.
  1333. * @stable ICU 4.0
  1334. */
  1335. U_STABLE void U_EXPORT2
  1336. uregex_setTimeLimit(URegularExpression *regexp,
  1337. int32_t limit,
  1338. UErrorCode *status);
  1339. /**
  1340. * Get the time limit for matches with this URegularExpression.
  1341. * A return value of zero indicates that there is no limit.
  1342. *
  1343. * @param regexp The compiled regular expression.
  1344. * @param status A reference to a UErrorCode to receive any errors.
  1345. * @return the maximum allowed time for a match, in units of processing steps.
  1346. * @stable ICU 4.0
  1347. */
  1348. U_STABLE int32_t U_EXPORT2
  1349. uregex_getTimeLimit(const URegularExpression *regexp,
  1350. UErrorCode *status);
  1351. /**
  1352. * Set the amount of heap storage available for use by the match backtracking stack.
  1353. * <p>
  1354. * ICU uses a backtracking regular expression engine, with the backtrack stack
  1355. * maintained on the heap. This function sets the limit to the amount of memory
  1356. * that can be used for this purpose. A backtracking stack overflow will
  1357. * result in an error from the match operation that caused it.
  1358. * <p>
  1359. * A limit is desirable because a malicious or poorly designed pattern can use
  1360. * excessive memory, potentially crashing the process. A limit is enabled
  1361. * by default.
  1362. *
  1363. * @param regexp The compiled regular expression.
  1364. * @param limit The maximum size, in bytes, of the matching backtrack stack.
  1365. * A value of zero means no limit.
  1366. * The limit must be greater than or equal to zero.
  1367. * @param status A reference to a UErrorCode to receive any errors.
  1368. *
  1369. * @stable ICU 4.0
  1370. */
  1371. U_STABLE void U_EXPORT2
  1372. uregex_setStackLimit(URegularExpression *regexp,
  1373. int32_t limit,
  1374. UErrorCode *status);
  1375. /**
  1376. * Get the size of the heap storage available for use by the backtracking stack.
  1377. *
        * @param regexp The compiled regular expression.
        * @param status A reference to a UErrorCode to receive any errors.
  1378. * @return the maximum backtracking stack size, in bytes, or zero if the
  1379. * stack size is unlimited.
  1380. * @stable ICU 4.0
  1381. */
  1382. U_STABLE int32_t U_EXPORT2
  1383. uregex_getStackLimit(const URegularExpression *regexp,
  1384. UErrorCode *status);
  1385. /**
  1386. * Function pointer for a regular expression matching callback function.
  1387. * When set, a callback function will be called periodically during matching
  1388. * operations. If the callback function returns FALSE, the matching
  1389. * operation will be terminated early.
  1390. *
  1391. * Note: the callback function must not call other functions on this
  1392. * URegularExpression.
  1393. *
  1394. * @param context context pointer. The callback function will be invoked
  1395. * with the context specified at the time that
  1396. * uregex_setMatchCallback() is called.
  1397. * @param steps the accumulated processing time, in match steps,
  1398. * for this matching operation.
  1399. * @return TRUE to continue the matching operation.
  1400. * FALSE to terminate the matching operation.
  1401. * @stable ICU 4.0
  1402. */
  1403. U_CDECL_BEGIN
  1404. typedef UBool U_CALLCONV URegexMatchCallback (
  1405. const void *context,
  1406. int32_t steps);
  1407. U_CDECL_END
  1408. /**
  1409. * Set the match callback function for this URegularExpression.
  1410. * During matching operations the function will be called periodically,
  1411. * giving the application the opportunity to terminate a long-running
  1412. * match.
  1413. *
  1414. * @param regexp The compiled regular expression.
  1415. * @param callback A pointer to the user-supplied URegexMatchCallback function.
  1416. * @param context User context pointer. The value supplied at the
  1417. * time the callback function is set will be saved
  1418. * and passed to the callback each time that it is called.
  1419. * @param status A reference to a UErrorCode to receive any errors.
  1420. * @stable ICU 4.0
  1421. */
  1422. U_STABLE void U_EXPORT2
  1423. uregex_setMatchCallback(URegularExpression *regexp,
  1424. URegexMatchCallback *callback,
  1425. const void *context,
  1426. UErrorCode *status);
  1427. /**
  1428. * Get the match callback function for this URegularExpression.
  1429. *
  1430. * @param regexp The compiled regular expression.
  1431. * @param callback Out parameter, receives a pointer to the user-supplied
  1432. * URegexMatchCallback function.
  1433. * @param context Out parameter, receives the user context pointer that
  1434. * was set when uregex_setMatchCallback() was called.
  1435. * @param status A reference to a UErrorCode to receive any errors.
  1436. * @stable ICU 4.0
  1437. */
  1438. U_STABLE void U_EXPORT2
  1439. uregex_getMatchCallback(const URegularExpression *regexp,
  1440. URegexMatchCallback **callback,
  1441. const void **context,
  1442. UErrorCode *status);
  1443. /**
  1444. * Function pointer for a regular expression find callback function.
  1445. *
  1446. * When set, a callback function will be called during a find operation
  1447. * and for operations that depend on find, such as findNext, split and some replace
  1448. * operations like replaceFirst.
  1449. * The callback will usually be called after each attempt at a match, but this is not a
  1450. * guarantee that the callback will be invoked at each character. For finds where the
  1451. * match engine is invoked at each character, this may be close to true, but less likely
  1452. * for more optimized loops where the pattern is known to only start, and the match
  1453. * engine invoked, at certain characters.
  1454. * When invoked, this callback will specify the index at which a match operation is about
  1455. * to be attempted, giving the application the opportunity to terminate a long-running
  1456. * find operation.
  1457. *
  1458. * If the callback function returns FALSE, the find operation will be terminated early.
  1459. *
  1460. * Note: the callback function must not call other functions on this
  1461. * URegularExpression.
  1462. *
  1463. * @param context context pointer. The callback function will be invoked
  1464. * with the context specified at the time that
  1465. * uregex_setFindProgressCallback() is called.
  1466. * @param matchIndex the next index at which a match will be attempted for this
  1467. * find operation. If this callback interrupts the search, this is the
  1468. * index at which a find/findNext operation may be re-initiated.
  1469. * @return TRUE to continue the matching operation.
  1470. * FALSE to terminate the matching operation.
  1471. * @stable ICU 4.6
  1472. */
  1473. U_CDECL_BEGIN
  1474. typedef UBool U_CALLCONV URegexFindProgressCallback (
  1475. const void *context,
  1476. int64_t matchIndex);
  1477. U_CDECL_END
  1478. /**
  1479. * Set the find progress callback function for this URegularExpression.
  1480. *
  1481. * @param regexp The compiled regular expression.
  1482. * @param callback A pointer to the user-supplied URegexFindProgressCallback function.
  1483. * @param context User context pointer. The value supplied at the
  1484. * time the callback function is set will be saved
  1485. * and passed to the callback each time that it is called.
  1486. * @param status A reference to a UErrorCode to receive any errors.
  1487. * @stable ICU 4.6
  1488. */
  1489. U_STABLE void U_EXPORT2
  1490. uregex_setFindProgressCallback(URegularExpression *regexp,
  1491. URegexFindProgressCallback *callback,
  1492. const void *context,
  1493. UErrorCode *status);
  1494. /**
  1495. * Get the find progress callback function for this URegularExpression.
  1496. *
  1497. * @param regexp The compiled regular expression.
  1498. * @param callback Out parameter, receives a pointer to the user-supplied
  1499. * URegexFindProgressCallback function.
  1500. * @param context Out parameter, receives the user context pointer that
  1501. * was set when uregex_setFindProgressCallback() was called.
  1502. * @param status A reference to a UErrorCode to receive any errors.
  1503. * @stable ICU 4.6
  1504. */
  1505. U_STABLE void U_EXPORT2
  1506. uregex_getFindProgressCallback(const URegularExpression *regexp,
  1507. URegexFindProgressCallback **callback,
  1508. const void **context,
  1509. UErrorCode *status);
  1510. #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
  1511. #endif /* UREGEX_H */