wildcard.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. /*
  2. * Wildcard matching engine for use with SFTP-based file transfer
  3. * programs (PSFTP, new-look PSCP): since SFTP has no notion of
  4. * getting the remote side to do globbing (and rightly so) we have
  5. * to do it locally, by retrieving all the filenames in a directory
  6. * and checking each against the wildcard pattern.
  7. */
  8. #include <assert.h>
  9. #include <stdlib.h>
  10. #include <string.h>
  11. /* MPEXT: This makes compilation from command-line hang */
  12. #ifndef MPEXT
  13. #include "putty.h"
  14. #endif
  15. /*
  16. * Definition of wildcard syntax:
  17. *
  18. * - * matches any sequence of characters, including zero.
  19. * - ? matches exactly one character which can be anything.
  20. * - [abc] matches exactly one character which is a, b or c.
  21. * - [a-f] matches anything from a through f.
  22. * - [^a-f] matches anything _except_ a through f.
  23. * - [-_] matches - or _; [^-_] matches anything else. (The - is
  24. * non-special if it occurs immediately after the opening
  25. * bracket or ^.)
  26. * - [a^] matches an a or a ^. (The ^ is non-special if it does
  27. * _not_ occur immediately after the opening bracket.)
  28. * - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \.
  29. * - All other characters are non-special and match themselves.
  30. */
  31. /*
  32. * Some notes on differences from POSIX globs (IEEE Std 1003.1, 2003 ed.):
  33. * - backslashes act as escapes even within [] bracket expressions
  34. * - does not support [!...] for non-matching list (POSIX are weird);
  35. * NB POSIX allows [^...] as well via "A bracket expression starting
  36. * with an unquoted circumflex character produces unspecified
  37. * results". If we wanted to allow [!...] we might want to define
  38. * [^!] as having its literal meaning (match '^' or '!').
  39. * - none of the scary [[:class:]] stuff, etc
  40. */
  41. /*
  42. * The wildcard matching technique we use is very simple and
  43. * potentially O(N^2) in running time, but I don't anticipate it
  44. * being that bad in reality (particularly since N will be the size
  45. * of a filename, which isn't all that much). Perhaps one day, once
  46. * PuTTY has grown a regexp matcher for some other reason, I might
  47. * come back and reimplement wildcards by translating them into
  48. * regexps or directly into NFAs; but for the moment, in the
  49. * absence of any other need for the NFA->DFA translation engine,
  50. * anything more than the simplest possible wildcard matcher is
  51. * vast code-size overkill.
  52. *
  53. * Essentially, these wildcards are much simpler than regexps in
  54. * that they consist of a sequence of rigid fragments (? and [...]
  55. * can never match more or less than one character) separated by
  56. * asterisks. It is therefore extremely simple to look at a rigid
  57. * fragment and determine whether or not it begins at a particular
  58. * point in the test string; so we can search along the string
  59. * until we find each fragment, then search for the next. As long
  60. * as we find each fragment in the _first_ place it occurs, there
  61. * will never be a danger of having to backpedal and try to find it
  62. * again somewhere else.
  63. */
  64. enum {
  65. WC_TRAILINGBACKSLASH = 1,
  66. WC_UNCLOSEDCLASS,
  67. WC_INVALIDRANGE
  68. };
  69. /*
  70. * Error reporting is done by returning various negative values
  71. * from the wildcard routines. Passing any such value to wc_error
  72. * will give a human-readable message.
  73. */
  74. const char *wc_error(int value)
  75. {
  76. value = abs(value);
  77. switch (value) {
  78. case WC_TRAILINGBACKSLASH:
  79. return "'\' occurred at end of string (expected another character)";
  80. case WC_UNCLOSEDCLASS:
  81. return "expected ']' to close character class";
  82. case WC_INVALIDRANGE:
  83. return "character range was not terminated (']' just after '-')";
  84. }
  85. return "INTERNAL ERROR: unrecognised wildcard error number";
  86. }
  87. /*
  88. * This is the routine that tests a target string to see if an
  89. * initial substring of it matches a fragment. If successful, it
  90. * returns 1, and advances both `fragment' and `target' past the
  91. * fragment and matching substring respectively. If unsuccessful it
  92. * returns zero. If the wildcard fragment suffers a syntax error,
  93. * it returns <0 and the precise value indexes into wc_error.
  94. */
  95. static int wc_match_fragment(const char **fragment, const char **target,
  96. const char *target_end)
  97. {
  98. const char *f, *t;
  99. f = *fragment;
  100. t = *target;
  101. /*
  102. * The fragment terminates at either the end of the string, or
  103. * the first (unescaped) *.
  104. */
  105. while (*f && *f != '*' && t < target_end) {
  106. /*
  107. * Extract one character from t, and one character's worth
  108. * of pattern from f, and step along both. Return 0 if they
  109. * fail to match.
  110. */
  111. if (*f == '\\') {
  112. /*
  113. * Backslash, which means f[1] is to be treated as a
  114. * literal character no matter what it is. It may not
  115. * be the end of the string.
  116. */
  117. if (!f[1])
  118. return -WC_TRAILINGBACKSLASH; /* error */
  119. if (f[1] != *t)
  120. return 0; /* failed to match */
  121. f += 2;
  122. } else if (*f == '?') {
  123. /*
  124. * Question mark matches anything.
  125. */
  126. f++;
  127. } else if (*f == '[') {
  128. int invert = 0;
  129. int matched = 0;
  130. /*
  131. * Open bracket introduces a character class.
  132. */
  133. f++;
  134. if (*f == '^') {
  135. invert = 1;
  136. f++;
  137. }
  138. while (*f != ']') {
  139. if (*f == '\\')
  140. f++; /* backslashes still work */
  141. if (!*f)
  142. return -WC_UNCLOSEDCLASS; /* error again */
  143. if (f[1] == '-') {
  144. int lower, upper, ourchr;
  145. lower = (unsigned char) *f++;
  146. f++; /* eat the minus */
  147. if (*f == ']')
  148. return -WC_INVALIDRANGE; /* different error! */
  149. if (*f == '\\')
  150. f++; /* backslashes _still_ work */
  151. if (!*f)
  152. return -WC_UNCLOSEDCLASS; /* error again */
  153. upper = (unsigned char) *f++;
  154. ourchr = (unsigned char) *t;
  155. if (lower > upper) {
  156. int t = lower; lower = upper; upper = t;
  157. }
  158. if (ourchr >= lower && ourchr <= upper)
  159. matched = 1;
  160. } else {
  161. matched |= (*t == *f++);
  162. }
  163. }
  164. if (invert == matched)
  165. return 0; /* failed to match character class */
  166. f++; /* eat the ] */
  167. } else {
  168. /*
  169. * Non-special character matches itself.
  170. */
  171. if (*f != *t)
  172. return 0;
  173. f++;
  174. }
  175. /*
  176. * Now we've done that, increment t past the character we
  177. * matched.
  178. */
  179. t++;
  180. }
  181. if (!*f || *f == '*') {
  182. /*
  183. * We have reached the end of f without finding a mismatch;
  184. * so we're done. Update the caller pointers and return 1.
  185. */
  186. *fragment = f;
  187. *target = t;
  188. return 1;
  189. }
  190. /*
  191. * Otherwise, we must have reached the end of t before we
  192. * reached the end of f; so we've failed. Return 0.
  193. */
  194. return 0;
  195. }
  196. /*
  197. * This is the real wildcard matching routine. It returns 1 for a
  198. * successful match, 0 for an unsuccessful match, and <0 for a
  199. * syntax error in the wildcard.
  200. */
  201. static int wc_match_inner(
  202. const char *wildcard, const char *target, size_t target_len)
  203. {
  204. const char *target_end = target + target_len;
  205. int ret;
  206. /*
  207. * Every time we see a '*' _followed_ by a fragment, we just
  208. * search along the string for a location at which the fragment
  209. * matches. The only special case is when we see a fragment
  210. * right at the start, in which case we just call the matching
  211. * routine once and give up if it fails.
  212. */
  213. if (*wildcard != '*') {
  214. ret = wc_match_fragment(&wildcard, &target, target_end);
  215. if (ret <= 0)
  216. return ret; /* pass back failure or error alike */
  217. }
  218. while (*wildcard) {
  219. assert(*wildcard == '*');
  220. while (*wildcard == '*')
  221. wildcard++;
  222. /*
  223. * It's possible we've just hit the end of the wildcard
  224. * after seeing a *, in which case there's no need to
  225. * bother searching any more because we've won.
  226. */
  227. if (!*wildcard)
  228. return 1;
  229. /*
  230. * Now `wildcard' points at the next fragment. So we
  231. * attempt to match it against `target', and if that fails
  232. * we increment `target' and try again, and so on. When we
  233. * find we're about to try matching against the empty
  234. * string, we give up and return 0.
  235. */
  236. ret = 0;
  237. while (*target) {
  238. const char *save_w = wildcard, *save_t = target;
  239. ret = wc_match_fragment(&wildcard, &target, target_end);
  240. if (ret < 0)
  241. return ret; /* syntax error */
  242. if (ret > 0 && !*wildcard && target != target_end) {
  243. /*
  244. * Final special case - literally.
  245. *
  246. * This situation arises when we are matching a
  247. * _terminal_ fragment of the wildcard (that is,
  248. * there is nothing after it, e.g. "*a"), and it
  249. * has matched _too early_. For example, matching
  250. * "*a" against "parka" will match the "a" fragment
  251. * against the _first_ a, and then (if it weren't
  252. * for this special case) matching would fail
  253. * because we're at the end of the wildcard but not
  254. * at the end of the target string.
  255. *
  256. * In this case what we must do is measure the
  257. * length of the fragment in the target (which is
  258. * why we saved `target'), jump straight to that
  259. * distance from the end of the string using
  260. * strlen, and match the same fragment again there
  261. * (which is why we saved `wildcard'). Then we
  262. * return whatever that operation returns.
  263. */
  264. target = target_end - (target - save_t);
  265. wildcard = save_w;
  266. return wc_match_fragment(&wildcard, &target, target_end);
  267. }
  268. if (ret > 0)
  269. break;
  270. target++;
  271. }
  272. if (ret > 0)
  273. continue;
  274. return 0;
  275. }
  276. /*
  277. * If we reach here, it must be because we successfully matched
  278. * a fragment and then found ourselves right at the end of the
  279. * wildcard. Hence, we return 1 if and only if we are also
  280. * right at the end of the target.
  281. */
  282. return target == target_end;
  283. }
  284. int wc_match(const char *wildcard, const char *target)
  285. {
  286. return wc_match_inner(wildcard, target, strlen(target));
  287. }
  288. int wc_match_pl(const char *wildcard, ptrlen target)
  289. {
  290. return wc_match_inner(wildcard, target.ptr, target.len);
  291. }
  292. /*
  293. * Another utility routine that translates a non-wildcard string
  294. * into its raw equivalent by removing any escaping backslashes.
  295. * Expects a target string buffer of anything up to the length of
  296. * the original wildcard. You can also pass NULL as the output
  297. * buffer if you're only interested in the return value.
  298. *
  299. * Returns 1 on success, or 0 if a wildcard character was
  300. * encountered. In the latter case the output string MAY not be
  301. * zero-terminated and you should not use it for anything!
  302. */
  303. int wc_unescape(char *output, const char *wildcard)
  304. {
  305. while (*wildcard) {
  306. if (*wildcard == '\\') {
  307. wildcard++;
  308. /* We are lenient about trailing backslashes in non-wildcards. */
  309. if (*wildcard) {
  310. if (output)
  311. *output++ = *wildcard;
  312. wildcard++;
  313. }
  314. } else if (*wildcard == '*' || *wildcard == '?' ||
  315. *wildcard == '[' || *wildcard == ']') {
  316. return 0; /* it's a wildcard! */
  317. } else {
  318. if (output)
  319. *output++ = *wildcard;
  320. wildcard++;
  321. }
  322. }
  323. if (output)
  324. *output = '\0';
  325. return 1; /* it's clean */
  326. }
  327. #ifdef TESTMODE
  328. struct test {
  329. const char *wildcard;
  330. const char *target;
  331. int expected_result;
  332. };
  333. const struct test fragment_tests[] = {
  334. /*
  335. * We exhaustively unit-test the fragment matching routine
  336. * itself, which should save us the need to test all its
  337. * intricacies during the full wildcard tests.
  338. */
  339. {"abc", "abc", 1},
  340. {"abc", "abd", 0},
  341. {"abc", "abcd", 1},
  342. {"abcd", "abc", 0},
  343. {"ab[cd]", "abc", 1},
  344. {"ab[cd]", "abd", 1},
  345. {"ab[cd]", "abe", 0},
  346. {"ab[^cd]", "abc", 0},
  347. {"ab[^cd]", "abd", 0},
  348. {"ab[^cd]", "abe", 1},
  349. {"ab\\", "abc", -WC_TRAILINGBACKSLASH},
  350. {"ab\\*", "ab*", 1},
  351. {"ab\\?", "ab*", 0},
  352. {"ab?", "abc", 1},
  353. {"ab?", "ab", 0},
  354. {"ab[", "abc", -WC_UNCLOSEDCLASS},
  355. {"ab[c-", "abb", -WC_UNCLOSEDCLASS},
  356. {"ab[c-]", "abb", -WC_INVALIDRANGE},
  357. {"ab[c-e]", "abb", 0},
  358. {"ab[c-e]", "abc", 1},
  359. {"ab[c-e]", "abd", 1},
  360. {"ab[c-e]", "abe", 1},
  361. {"ab[c-e]", "abf", 0},
  362. {"ab[e-c]", "abb", 0},
  363. {"ab[e-c]", "abc", 1},
  364. {"ab[e-c]", "abd", 1},
  365. {"ab[e-c]", "abe", 1},
  366. {"ab[e-c]", "abf", 0},
  367. {"ab[^c-e]", "abb", 1},
  368. {"ab[^c-e]", "abc", 0},
  369. {"ab[^c-e]", "abd", 0},
  370. {"ab[^c-e]", "abe", 0},
  371. {"ab[^c-e]", "abf", 1},
  372. {"ab[^e-c]", "abb", 1},
  373. {"ab[^e-c]", "abc", 0},
  374. {"ab[^e-c]", "abd", 0},
  375. {"ab[^e-c]", "abe", 0},
  376. {"ab[^e-c]", "abf", 1},
  377. {"ab[a^]", "aba", 1},
  378. {"ab[a^]", "ab^", 1},
  379. {"ab[a^]", "abb", 0},
  380. {"ab[^a^]", "aba", 0},
  381. {"ab[^a^]", "ab^", 0},
  382. {"ab[^a^]", "abb", 1},
  383. {"ab[-c]", "ab-", 1},
  384. {"ab[-c]", "abc", 1},
  385. {"ab[-c]", "abd", 0},
  386. {"ab[^-c]", "ab-", 0},
  387. {"ab[^-c]", "abc", 0},
  388. {"ab[^-c]", "abd", 1},
  389. {"ab[\\[-\\]]", "abZ", 0},
  390. {"ab[\\[-\\]]", "ab[", 1},
  391. {"ab[\\[-\\]]", "ab\\", 1},
  392. {"ab[\\[-\\]]", "ab]", 1},
  393. {"ab[\\[-\\]]", "ab^", 0},
  394. {"ab[^\\[-\\]]", "abZ", 1},
  395. {"ab[^\\[-\\]]", "ab[", 0},
  396. {"ab[^\\[-\\]]", "ab\\", 0},
  397. {"ab[^\\[-\\]]", "ab]", 0},
  398. {"ab[^\\[-\\]]", "ab^", 1},
  399. {"ab[a-fA-F]", "aba", 1},
  400. {"ab[a-fA-F]", "abF", 1},
  401. {"ab[a-fA-F]", "abZ", 0},
  402. };
  403. const struct test full_tests[] = {
  404. {"a", "argh", 0},
  405. {"a", "ba", 0},
  406. {"a", "a", 1},
  407. {"a*", "aardvark", 1},
  408. {"a*", "badger", 0},
  409. {"*a", "park", 0},
  410. {"*a", "pArka", 1},
  411. {"*a", "parka", 1},
  412. {"*a*", "park", 1},
  413. {"*a*", "perk", 0},
  414. {"?b*r?", "abracadabra", 1},
  415. {"?b*r?", "abracadabr", 0},
  416. {"?b*r?", "abracadabzr", 0},
  417. };
  418. int main(void)
  419. {
  420. int i;
  421. int fails, passes;
  422. fails = passes = 0;
  423. for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) {
  424. const char *f, *t;
  425. int eret, aret;
  426. f = fragment_tests[i].wildcard;
  427. t = fragment_tests[i].target;
  428. eret = fragment_tests[i].expected_result;
  429. aret = wc_match_fragment(&f, &t, t + strlen(t));
  430. if (aret != eret) {
  431. printf("failed test: /%s/ against /%s/ returned %d not %d\n",
  432. fragment_tests[i].wildcard, fragment_tests[i].target,
  433. aret, eret);
  434. fails++;
  435. } else
  436. passes++;
  437. }
  438. for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) {
  439. const char *f, *t;
  440. int eret, aret;
  441. f = full_tests[i].wildcard;
  442. t = full_tests[i].target;
  443. eret = full_tests[i].expected_result;
  444. aret = wc_match(f, t);
  445. if (aret != eret) {
  446. printf("failed test: /%s/ against /%s/ returned %d not %d\n",
  447. full_tests[i].wildcard, full_tests[i].target,
  448. aret, eret);
  449. fails++;
  450. } else
  451. passes++;
  452. }
  453. printf("passed %d, failed %d\n", passes, fails);
  454. return 0;
  455. }
  456. #endif