archive_read_support_format_warc.c 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858
  1. /*-
  2. * Copyright (c) 2014 Sebastian Freundt
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
  15. * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  16. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  17. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
  18. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  19. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  20. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  21. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  23. * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #include "archive_platform.h"
  26. /**
  27. * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
  28. * ISO 28500:2009.
  29. * For the purposes of this file we used the final draft from:
  30. * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
  31. *
  32. * Todo:
  33. * [ ] real-world warcs can contain resources at endpoints ending in /
  34. * e.g. http://bibnum.bnf.fr/warc/
  35. * if you're lucky their response contains a Content-Location: header
  36. * pointing to a unix-compliant filename, in the example above it's
  37. * Content-Location: http://bibnum.bnf.fr/warc/index.html
  38. * however, that's not mandated and github for example doesn't follow
  39. * this convention.
  40. * We need a set of archive options to control what to do with
  41. * entries like these, at the moment care is taken to skip them.
  42. *
  43. **/
  44. #ifdef HAVE_SYS_STAT_H
  45. #include <sys/stat.h>
  46. #endif
  47. #ifdef HAVE_ERRNO_H
  48. #include <errno.h>
  49. #endif
  50. #ifdef HAVE_STDLIB_H
  51. #include <stdlib.h>
  52. #endif
  53. #ifdef HAVE_STRING_H
  54. #include <string.h>
  55. #endif
  56. #ifdef HAVE_LIMITS_H
  57. #include <limits.h>
  58. #endif
  59. #ifdef HAVE_CTYPE_H
  60. #include <ctype.h>
  61. #endif
  62. #ifdef HAVE_TIME_H
  63. #include <time.h>
  64. #endif
  65. #include "archive.h"
  66. #include "archive_entry.h"
  67. #include "archive_private.h"
  68. #include "archive_read_private.h"
  /*
   * Record types as announced by the WARC-Type header field.
   * The enumerator order (and hence the numeric values) is kept as is.
   */
  typedef enum {
      WT_NONE,    /* no type, or one that was not recognised */
      WT_INFO,    /* warcinfo */
      WT_META,    /* metadata */
      WT_RSRC,    /* resource */
      WT_REQ,     /* request, unsupported */
      WT_RSP,     /* response */
      WT_RVIS,    /* revisit, unsupported */
      WT_CONV,    /* conversion, unsupported */
      WT_CONT,    /* continuation, unsupported at the moment */
      LAST_WT     /* invalid type, also serves as the enumerator count */
  } warc_type_t;
  /*
   * Read-only, length-delimited string view.  The bytes are borrowed from
   * some other buffer and need not be NUL-terminated.
   */
  typedef struct {
      size_t len;         /* number of bytes available at str */
      const char *str;    /* first byte of the view, NULL for "nothing" */
  } warc_string_t;
  /* Writable counterpart of warc_string_t: a buffer together with its size. */
  typedef struct {
      size_t len;     /* size of the buffer at str, in bytes */
      char *str;      /* the buffer itself */
  } warc_strbuf_t;
  98. struct warc_s {
  99. /* content length ahead */
  100. size_t cntlen;
  101. /* and how much we've processed so far */
  102. size_t cntoff;
  103. /* and how much we need to consume between calls */
  104. size_t unconsumed;
  105. /* string pool */
  106. warc_strbuf_t pool;
  107. /* previous version */
  108. unsigned int pver;
  109. /* stringified format name */
  110. struct archive_string sver;
  111. };
  112. static int _warc_bid(struct archive_read *a, int);
  113. static int _warc_cleanup(struct archive_read *a);
  114. static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
  115. static int _warc_skip(struct archive_read *a);
  116. static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
  117. /* private routines */
  118. static unsigned int _warc_rdver(const char *buf, size_t bsz);
  119. static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
  120. static warc_string_t _warc_rduri(const char *buf, size_t bsz);
  121. static ssize_t _warc_rdlen(const char *buf, size_t bsz);
  122. static time_t _warc_rdrtm(const char *buf, size_t bsz);
  123. static time_t _warc_rdmtm(const char *buf, size_t bsz);
  124. static const char *_warc_find_eoh(const char *buf, size_t bsz);
  125. static const char *_warc_find_eol(const char *buf, size_t bsz);
  126. int
  127. archive_read_support_format_warc(struct archive *_a)
  128. {
  129. struct archive_read *a = (struct archive_read *)_a;
  130. struct warc_s *w;
  131. int r;
  132. archive_check_magic(_a, ARCHIVE_READ_MAGIC,
  133. ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
  134. if ((w = calloc(1, sizeof(*w))) == NULL) {
  135. archive_set_error(&a->archive, ENOMEM,
  136. "Can't allocate warc data");
  137. return (ARCHIVE_FATAL);
  138. }
  139. r = __archive_read_register_format(
  140. a, w, "warc",
  141. _warc_bid, NULL, _warc_rdhdr, _warc_read,
  142. _warc_skip, NULL, _warc_cleanup, NULL, NULL);
  143. if (r != ARCHIVE_OK) {
  144. free(w);
  145. return (r);
  146. }
  147. return (ARCHIVE_OK);
  148. }
  149. static int
  150. _warc_cleanup(struct archive_read *a)
  151. {
  152. struct warc_s *w = a->format->data;
  153. if (w->pool.len > 0U) {
  154. free(w->pool.str);
  155. }
  156. archive_string_free(&w->sver);
  157. free(w);
  158. a->format->data = NULL;
  159. return (ARCHIVE_OK);
  160. }
  /*
   * Bid callback: look at the first 12 bytes of the stream, which should
   * already be the version line of the first record.
   * Returns 64 when a version from WARC 0.12 up to 1.0 is found there,
   * -1 otherwise (including when fewer than 12 bytes are available).
   */
  static int
  _warc_bid(struct archive_read *a, int best_bid)
  {
      const char *probe;
      ssize_t avail;
      unsigned int version;

      (void)best_bid; /* UNUSED */

      probe = __archive_read_ahead(a, 12U, &avail);
      if (probe == NULL || avail < 12) {
          /* nothing to look at, or less than our magic cookie needs */
          return -1;
      }

      version = _warc_rdver(probe, avail);
      if (version >= 1200U && version <= 10000U) {
          /* a version we support, be confident */
          return (64);
      }
      return -1;
  }
  185. static int
  186. _warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
  187. {
  188. #define HDR_PROBE_LEN (12U)
  189. struct warc_s *w = a->format->data;
  190. unsigned int ver;
  191. const char *buf;
  192. ssize_t nrd;
  193. const char *eoh;
  194. char *tmp;
  195. /* for the file name, saves some strndup()'ing */
  196. warc_string_t fnam;
  197. /* warc record type, not that we really use it a lot */
  198. warc_type_t ftyp;
  199. /* content-length+error monad */
  200. ssize_t cntlen;
  201. /* record time is the WARC-Date time we reinterpret it as ctime */
  202. time_t rtime;
  203. /* mtime is the Last-Modified time which will be the entry's mtime */
  204. time_t mtime;
  205. start_over:
  206. /* just use read_ahead() they keep track of unconsumed
  207. * bits and bobs for us; no need to put an extra shift in
  208. * and reproduce that functionality here */
  209. buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
  210. if (nrd < 0) {
  211. /* no good */
  212. archive_set_error(
  213. &a->archive, ARCHIVE_ERRNO_MISC,
  214. "Bad record header");
  215. return (ARCHIVE_FATAL);
  216. } else if (buf == NULL) {
  217. /* there should be room for at least WARC/bla\r\n
  218. * must be EOF therefore */
  219. return (ARCHIVE_EOF);
  220. }
  221. /* looks good so far, try and find the end of the header now */
  222. eoh = _warc_find_eoh(buf, nrd);
  223. if (eoh == NULL) {
  224. /* still no good, the header end might be beyond the
  225. * probe we've requested, but then again who'd cram
  226. * so much stuff into the header *and* be 28500-compliant */
  227. archive_set_error(
  228. &a->archive, ARCHIVE_ERRNO_MISC,
  229. "Bad record header");
  230. return (ARCHIVE_FATAL);
  231. }
  232. ver = _warc_rdver(buf, eoh - buf);
  233. /* we currently support WARC 0.12 to 1.0 */
  234. if (ver == 0U) {
  235. archive_set_error(
  236. &a->archive, ARCHIVE_ERRNO_MISC,
  237. "Invalid record version");
  238. return (ARCHIVE_FATAL);
  239. } else if (ver < 1200U || ver > 10000U) {
  240. archive_set_error(
  241. &a->archive, ARCHIVE_ERRNO_MISC,
  242. "Unsupported record version: %u.%u",
  243. ver / 10000, (ver % 10000) / 100);
  244. return (ARCHIVE_FATAL);
  245. }
  246. cntlen = _warc_rdlen(buf, eoh - buf);
  247. if (cntlen < 0) {
  248. /* nightmare! the specs say content-length is mandatory
  249. * so I don't feel overly bad stopping the reader here */
  250. archive_set_error(
  251. &a->archive, EINVAL,
  252. "Bad content length");
  253. return (ARCHIVE_FATAL);
  254. }
  255. rtime = _warc_rdrtm(buf, eoh - buf);
  256. if (rtime == (time_t)-1) {
  257. /* record time is mandatory as per WARC/1.0,
  258. * so just barf here, fast and loud */
  259. archive_set_error(
  260. &a->archive, EINVAL,
  261. "Bad record time");
  262. return (ARCHIVE_FATAL);
  263. }
  264. /* let the world know we're a WARC archive */
  265. a->archive.archive_format = ARCHIVE_FORMAT_WARC;
  266. if (ver != w->pver) {
  267. /* stringify this entry's version */
  268. archive_string_sprintf(&w->sver,
  269. "WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
  270. /* remember the version */
  271. w->pver = ver;
  272. }
  273. /* start off with the type */
  274. ftyp = _warc_rdtyp(buf, eoh - buf);
  275. /* and let future calls know about the content */
  276. w->cntlen = cntlen;
  277. w->cntoff = 0U;
  278. mtime = 0;/* Avoid compiling error on some platform. */
  279. switch (ftyp) {
  280. case WT_RSRC:
  281. case WT_RSP:
  282. /* only try and read the filename in the cases that are
  283. * guaranteed to have one */
  284. fnam = _warc_rduri(buf, eoh - buf);
  285. /* check the last character in the URI to avoid creating
  286. * directory endpoints as files, see Todo above */
  287. if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
  288. /* break here for now */
  289. fnam.len = 0U;
  290. fnam.str = NULL;
  291. break;
  292. }
  293. /* bang to our string pool, so we save a
  294. * malloc()+free() roundtrip */
  295. if (fnam.len + 1U > w->pool.len) {
  296. w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
  297. tmp = realloc(w->pool.str, w->pool.len);
  298. if (tmp == NULL) {
  299. archive_set_error(
  300. &a->archive, ENOMEM,
  301. "Out of memory");
  302. return (ARCHIVE_FATAL);
  303. }
  304. w->pool.str = tmp;
  305. }
  306. memcpy(w->pool.str, fnam.str, fnam.len);
  307. w->pool.str[fnam.len] = '\0';
  308. /* let no one else know about the pool, it's a secret, shhh */
  309. fnam.str = w->pool.str;
  310. /* snarf mtime or deduce from rtime
  311. * this is a custom header added by our writer, it's quite
  312. * hard to believe anyone else would go through with it
  313. * (apart from being part of some http responses of course) */
  314. if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
  315. mtime = rtime;
  316. }
  317. break;
  318. case WT_NONE:
  319. case WT_INFO:
  320. case WT_META:
  321. case WT_REQ:
  322. case WT_RVIS:
  323. case WT_CONV:
  324. case WT_CONT:
  325. case LAST_WT:
  326. default:
  327. fnam.len = 0U;
  328. fnam.str = NULL;
  329. break;
  330. }
  331. /* now eat some of those delicious buffer bits */
  332. __archive_read_consume(a, eoh - buf);
  333. switch (ftyp) {
  334. case WT_RSRC:
  335. case WT_RSP:
  336. if (fnam.len > 0U) {
  337. /* populate entry object */
  338. archive_entry_set_filetype(entry, AE_IFREG);
  339. archive_entry_copy_pathname(entry, fnam.str);
  340. archive_entry_set_size(entry, cntlen);
  341. archive_entry_set_perm(entry, 0644);
  342. /* rtime is the new ctime, mtime stays mtime */
  343. archive_entry_set_ctime(entry, rtime, 0L);
  344. archive_entry_set_mtime(entry, mtime, 0L);
  345. break;
  346. }
  347. /* FALLTHROUGH */
  348. case WT_NONE:
  349. case WT_INFO:
  350. case WT_META:
  351. case WT_REQ:
  352. case WT_RVIS:
  353. case WT_CONV:
  354. case WT_CONT:
  355. case LAST_WT:
  356. default:
  357. /* consume the content and start over */
  358. if (_warc_skip(a) < 0)
  359. return (ARCHIVE_FATAL);
  360. goto start_over;
  361. }
  362. return (ARCHIVE_OK);
  363. }
  364. static int
  365. _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
  366. {
  367. struct warc_s *w = a->format->data;
  368. const char *rab;
  369. ssize_t nrd;
  370. if (w->cntoff >= w->cntlen) {
  371. eof:
  372. /* it's our lucky day, no work, we can leave early */
  373. *buf = NULL;
  374. *bsz = 0U;
  375. *off = w->cntoff + 4U/*for \r\n\r\n separator*/;
  376. w->unconsumed = 0U;
  377. return (ARCHIVE_EOF);
  378. }
  379. if (w->unconsumed) {
  380. __archive_read_consume(a, w->unconsumed);
  381. w->unconsumed = 0U;
  382. }
  383. rab = __archive_read_ahead(a, 1U, &nrd);
  384. if (nrd < 0) {
  385. *bsz = 0U;
  386. /* big catastrophe */
  387. return (int)nrd;
  388. } else if (nrd == 0) {
  389. goto eof;
  390. } else if ((size_t)nrd > w->cntlen - w->cntoff) {
  391. /* clamp to content-length */
  392. nrd = w->cntlen - w->cntoff;
  393. }
  394. *off = w->cntoff;
  395. *bsz = nrd;
  396. *buf = rab;
  397. w->cntoff += nrd;
  398. w->unconsumed = (size_t)nrd;
  399. return (ARCHIVE_OK);
  400. }
  401. static int
  402. _warc_skip(struct archive_read *a)
  403. {
  404. struct warc_s *w = a->format->data;
  405. if (__archive_read_consume(a, w->cntlen) < 0 ||
  406. __archive_read_consume(a, 4U/*\r\n\r\n separator*/) < 0)
  407. return (ARCHIVE_FATAL);
  408. w->cntlen = 0U;
  409. w->cntoff = 0U;
  410. return (ARCHIVE_OK);
  411. }
  412. /* private routines */
  /*
   * Strip the const qualifier from a pointer.  The detour through
   * uintptr_t avoids a direct const-to-non-const pointer cast.
   */
  static void*
  deconst(const void *c)
  {
      const uintptr_t bits = (uintptr_t)c;

      return (void *)bits;
  }
  /*
   * Find the first occurrence of NEEDLE (NEEDLESIZE bytes) within HAY
   * (HAYSIZE bytes); neither has to be NUL-terminated.
   * Returns a pointer to the occurrence, HAY itself for an empty needle,
   * or NULL when there is none.
   *
   * A running XOR over a needle-sized window is used as a cheap filter;
   * memcmp() is only invoked on windows whose XOR equals the needle's.
   */
  static char*
  xmemmem(const char *hay, const size_t haysize,
      const char *needle, const size_t needlesize)
  {
      const char *const hay_end = hay + haysize;
      const char *const needle_end = needle + needlesize;
      const char *h;
      const char *n;
      const char *win;
      unsigned int hay_xor;
      unsigned int needle_xor;
      unsigned int same;

      /* a 0-sized needle is defined to be found anywhere in haystack */
      if (needlesize == 0UL) {
          return (char *)(uintptr_t)hay;
      }
      /* jump to the first byte of haystack that equals *NEEDLE */
      hay = memchr(hay, *needle, haysize);
      if (hay == NULL) {
          return NULL;
      }

      /* Both now start with the same byte.  Fold the needle and the first
       * window of the haystack into their XOR sums and, on the way, note
       * whether that first window already is the needle. */
      hay_xor = *hay;
      needle_xor = *hay;
      same = 1U;
      h = hay + 1U;
      n = needle + 1U;
      while (h < hay_end && n < needle_end) {
          hay_xor ^= *h;
          needle_xor ^= *n;
          same &= *h == *n;
          h++;
          n++;
      }
      if (n < needle_end) {
          /* haystack ran out before the needle did */
          return NULL;
      }
      if (same) {
          return (char *)(uintptr_t)hay;
      }

      /* Slide the window one byte at a time: drop the byte leaving on the
       * left, add the byte entering on the right (H). */
      for (win = hay; h < hay_end; h++) {
          hay_xor ^= *win++;
          hay_xor ^= *h;
          /* with equal XOR sums, NEEDLESIZE - 1 equal bytes imply that the
           * last one is equal as well; WIN is always < H */
          if (hay_xor == needle_xor &&
              memcmp(win, needle, needlesize - 1U) == 0) {
              return (char *)(uintptr_t)win;
          }
      }
      return NULL;
  }
  /*
   * Parse a short unsigned decimal number at STR.
   *
   * Digits are taken while the value can still grow without exceeding ULIM
   * and while the digit budget lasts (as many digits as ULIM has, but at
   * least two).  *EP is always set to the first character not taken.
   * Returns the value, -1 when STR does not start with a digit, or -2 when
   * the value is outside of [LLIM, ULIM].
   */
  static int
  strtoi_lim(const char *str, const char **ep, int llim, int ulim)
  {
      const char *cur = str;
      int value = 0;
      /* divided by 10 per digit taken, so it doubles as a digit counter */
      int budget = (ulim > 10) ? ulim : 10;

      while (value * 10 <= ulim && budget != 0 &&
          *cur >= '0' && *cur <= '9') {
          value = value * 10 + (*cur - '0');
          cur++;
          budget /= 10;
      }

      *ep = cur;
      if (cur == str) {
          return -1;
      }
      if (value < llim || value > ulim) {
          return -2;
      }
      return value;
  }
  /*
   * Convert the broken-down UTC time T into seconds since the epoch.
   *
   * Uses _mkgmtime() or timegm() where the build configuration says they
   * exist.  Otherwise the value is computed from first principles using
   * the POSIX "seconds since the epoch" formula; in that case tm_yday of T
   * is derived from tm_year/tm_mon/tm_mday and written back, and
   * (time_t)-1 is returned when tm_mon is outside of 0..11.
   *
   * The fallback does not call mktime(): that function interprets T as
   * local time and may adjust the fields for daylight saving.  It also
   * does all arithmetic in 64 bits, so dates beyond 2038 do not overflow
   * an int.
   */
  static time_t
  time_from_tm(struct tm *t)
  {
  #if HAVE__MKGMTIME
      return _mkgmtime(t);
  #elif HAVE_TIMEGM
      /* Use platform timegm() if available. */
      return (timegm(t));
  #else
      /* days elapsed before the first of each month in a non-leap year */
      static const int days_before[12] = {
          0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
      };
      const int year = t->tm_year + 1900;
      const int64_t y = t->tm_year;
      int64_t yday;
      int64_t secs;

      if (t->tm_mon < 0 || t->tm_mon > 11) {
          return ((time_t)-1);
      }

      /* First, fix up tm_yday based on the year/month/day. */
      yday = (int64_t)days_before[t->tm_mon] + (t->tm_mday - 1);
      if (t->tm_mon > 1 &&
          year % 4 == 0 && (year % 100 != 0 || year % 400 == 0)) {
          /* past February in a leap year */
          yday++;
      }
      t->tm_yday = (int)yday;

      /* Then we can compute timegm() from first principles. */
      secs = t->tm_sec;
      secs += (int64_t)t->tm_min * 60;
      secs += (int64_t)t->tm_hour * 3600;
      secs += yday * 86400;
      secs += (y - 70) * 31536000;
      secs += ((y - 69) / 4) * 86400;
      secs -= ((y - 1) / 100) * 86400;
      secs += ((y + 299) / 400) * 86400;
      return ((time_t)secs);
  #endif
  }
  /*
   * Like strptime() but strictly for ISO 8601 Zulu strings of the shape
   * YYYY-MM-DDThh:mm:ssZ; leading blanks and tabs are skipped.
   *
   * Returns the time stamp in UTC, or (time_t)-1 when a field is missing,
   * out of range or not followed by its separator.  When ENDPTR is given
   * it receives the position where parsing stopped.
   */
  static time_t
  xstrpisotime(const char *s, char **endptr)
  {
      struct tm tm;
      time_t stamp = (time_t)-1;

      /* make sure tm is clean */
      memset(&tm, 0, sizeof(tm));

      /* a courtesy to our callers: skip leading whitespace */
      while (*s == ' ' || *s == '\t') {
          ++s;
      }

      /* one-shot block, any malformed field bails out with stamp == -1 */
      do {
          /* year */
          tm.tm_year = strtoi_lim(s, &s, 1583, 4095);
          if (tm.tm_year < 0 || *s++ != '-') {
              break;
          }
          /* month */
          tm.tm_mon = strtoi_lim(s, &s, 1, 12);
          if (tm.tm_mon < 0 || *s++ != '-') {
              break;
          }
          /* day-of-month */
          tm.tm_mday = strtoi_lim(s, &s, 1, 31);
          if (tm.tm_mday < 0 || *s++ != 'T') {
              break;
          }
          /* hour */
          tm.tm_hour = strtoi_lim(s, &s, 0, 23);
          if (tm.tm_hour < 0 || *s++ != ':') {
              break;
          }
          /* minute */
          tm.tm_min = strtoi_lim(s, &s, 0, 59);
          if (tm.tm_min < 0 || *s++ != ':') {
              break;
          }
          /* second, 60 is allowed */
          tm.tm_sec = strtoi_lim(s, &s, 0, 60);
          if (tm.tm_sec < 0 || *s++ != 'Z') {
              break;
          }

          /* struct tm counts years from 1900 and months from 0 */
          tm.tm_year -= 1900;
          tm.tm_mon--;

          /* now convert our tm struct to a unix stamp using UTC */
          stamp = time_from_tm(&tm);
      } while (0);

      if (endptr != NULL) {
          *endptr = deconst(s);
      }
      return stamp;
  }
  /*
   * Read the version out of a "WARC/x.y" or "WARC/x.yy" line at BUF.
   *
   * Returns major * 10000 + minor * 100, where a one-digit minor counts as
   * such (1.0 -> 10000, 0.9 -> 900) and a two-digit minor as written
   * (0.12 -> 1200).  Returns 0 when BSZ is below 12, the magic is absent,
   * the number is malformed, or the version is not terminated properly:
   * CRLF for 0.12 and above, a blank or tab below that.
   */
  static unsigned int
  _warc_rdver(const char *buf, size_t bsz)
  {
      static const char magic[] = "WARC/";
      const size_t magic_len = sizeof(magic) - 1U;
      const char *num;
      const char *tail;
      unsigned int ver;

      if (bsz < 12 || memcmp(buf, magic, magic_len) != 0) {
          /* buffer too small or invalid magic */
          return 0U;
      }

      num = buf + magic_len;
      if (!isdigit((unsigned char)num[0U]) || num[1U] != '.' ||
          !isdigit((unsigned char)num[2U])) {
          /* not of the shape digit-dot-digit */
          return 0U;
      }

      /* major version */
      ver = (num[0U] - '0') * 10000U;
      /* minor version, we support a maximum of 2 digits there */
      if (isdigit((unsigned char)num[3U])) {
          ver += (num[2U] - '0') * 1000U;
          ver += (num[3U] - '0') * 100U;
          tail = num + 4U;
      } else {
          ver += (num[2U] - '0') * 100U;
          tail = num + 3U;
      }

      if (ver >= 1200U) {
          /* WARC 0.12 and above terminates the version with a CRLF */
          if (memcmp(tail, "\r\n", 2U) != 0) {
              return 0U;
          }
      } else if (*tail != ' ' && *tail != '\t') {
          /* WARC below version 0.12 has a space-separated header */
          return 0U;
      }
      return ver;
  }
  604. static unsigned int
  605. _warc_rdtyp(const char *buf, size_t bsz)
  606. {
  607. static const char _key[] = "\r\nWARC-Type:";
  608. const char *val, *eol;
  609. if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
  610. /* no bother */
  611. return WT_NONE;
  612. }
  613. val += sizeof(_key) - 1U;
  614. if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
  615. /* no end of line */
  616. return WT_NONE;
  617. }
  618. /* overread whitespace */
  619. while (val < eol && (*val == ' ' || *val == '\t'))
  620. ++val;
  621. if (val + 8U == eol) {
  622. if (memcmp(val, "resource", 8U) == 0)
  623. return WT_RSRC;
  624. else if (memcmp(val, "response", 8U) == 0)
  625. return WT_RSP;
  626. }
  627. return WT_NONE;
  628. }
  629. static warc_string_t
  630. _warc_rduri(const char *buf, size_t bsz)
  631. {
  632. static const char _key[] = "\r\nWARC-Target-URI:";
  633. const char *val, *uri, *eol, *p;
  634. warc_string_t res = {0U, NULL};
  635. if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
  636. /* no bother */
  637. return res;
  638. }
  639. /* overread whitespace */
  640. val += sizeof(_key) - 1U;
  641. if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
  642. /* no end of line */
  643. return res;
  644. }
  645. while (val < eol && (*val == ' ' || *val == '\t'))
  646. ++val;
  647. /* overread URL designators */
  648. if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
  649. /* not touching that! */
  650. return res;
  651. }
  652. /* spaces inside uri are not allowed, CRLF should follow */
  653. for (p = val; p < eol; p++) {
  654. if (isspace((unsigned char)*p))
  655. return res;
  656. }
  657. /* there must be at least space for ftp */
  658. if (uri < (val + 3U))
  659. return res;
  660. /* move uri to point to after :// */
  661. uri += 3U;
  662. /* now then, inspect the URI */
  663. if (memcmp(val, "file", 4U) == 0) {
  664. /* perfect, nothing left to do here */
  665. } else if (memcmp(val, "http", 4U) == 0 ||
  666. memcmp(val, "ftp", 3U) == 0) {
  667. /* overread domain, and the first / */
  668. while (uri < eol && *uri++ != '/');
  669. } else {
  670. /* not sure what to do? best to bugger off */
  671. return res;
  672. }
  673. res.str = uri;
  674. res.len = eol - uri;
  675. return res;
  676. }
  /*
   * Read the Content-Length field of the header.
   * Returns the length, or -1 when the field is missing, its line is not
   * terminated, the value does not start with a digit, strtol() reports an
   * error, or anything but the end of line follows the number.
   */
  static ssize_t
  _warc_rdlen(const char *buf, size_t bsz)
  {
      static const char _key[] = "\r\nContent-Length:";
      const size_t keylen = sizeof(_key) - 1U;
      const char *val;
      const char *eol;
      char *stop = NULL;
      long int len;

      val = xmemmem(buf, bsz, _key, keylen);
      if (val == NULL) {
          /* no such field */
          return -1;
      }
      val += keylen;

      eol = _warc_find_eol(val, buf + bsz - val);
      if (eol == NULL) {
          /* no end of line */
          return -1;
      }

      /* skip leading whitespace */
      for (; val < eol && (*val == ' ' || *val == '\t'); val++)
          ;

      /* there must be at least one digit, this also rules out a sign */
      if (!isdigit((unsigned char)*val)) {
          return -1;
      }

      errno = 0;
      len = strtol(val, &stop, 10);
      if (errno == 0 && stop == eol) {
          /* clean conversion and the line ends right behind the number */
          return (ssize_t)len;
      }
      return -1;
  }
  /*
   * Read the record time from the WARC-Date field of the header.
   * Returns the time stamp, or (time_t)-1 when the field is missing, its
   * line is not terminated, or the ISO 8601 time stamp does not extend
   * exactly to the end of the line.
   */
  static time_t
  _warc_rdrtm(const char *buf, size_t bsz)
  {
      static const char _key[] = "\r\nWARC-Date:";
      const size_t keylen = sizeof(_key) - 1U;
      const char *val;
      const char *eol;
      char *stop = NULL;
      time_t stamp;

      val = xmemmem(buf, bsz, _key, keylen);
      if (val == NULL) {
          /* no such field */
          return (time_t)-1;
      }
      val += keylen;

      eol = _warc_find_eol(val, buf + bsz - val);
      if (eol == NULL) {
          /* no end of line */
          return -1;
      }

      /* xstrpisotime() kindly overreads whitespace for us */
      stamp = xstrpisotime(val, &stop);
      /* the line must end right where the time stamp does */
      return (stop == eol) ? stamp : (time_t)-1;
  }
  /*
   * Read the modification time from the Last-Modified field of the header.
   * Returns the time stamp, or (time_t)-1 when the field is missing, its
   * line is not terminated, or the value is not an ISO 8601 Zulu time
   * stamp extending exactly to the end of the line.
   */
  static time_t
  _warc_rdmtm(const char *buf, size_t bsz)
  {
      static const char _key[] = "\r\nLast-Modified:";
      const size_t keylen = sizeof(_key) - 1U;
      const char *val;
      const char *eol;
      char *stop = NULL;
      time_t stamp;

      val = xmemmem(buf, bsz, _key, keylen);
      if (val == NULL) {
          /* no such field */
          return (time_t)-1;
      }
      val += keylen;

      eol = _warc_find_eol(val, buf + bsz - val);
      if (eol == NULL) {
          /* no end of line */
          return -1;
      }

      /* xstrpisotime() kindly overreads whitespace for us */
      stamp = xstrpisotime(val, &stop);
      /* the line must end right where the time stamp does */
      return (stop == eol) ? stamp : (time_t)-1;
  }
  /*
   * Find the end of the record header, i.e. the first empty line.
   * Returns a pointer just behind the "\r\n\r\n" marker, or NULL when the
   * marker does not occur within the first BSZ bytes of BUF.
   */
  static const char*
  _warc_find_eoh(const char *buf, size_t bsz)
  {
      static const char _marker[] = "\r\n\r\n";
      const size_t mlen = sizeof(_marker) - 1U;
      const char *hit = xmemmem(buf, bsz, _marker, mlen);

      if (hit == NULL) {
          return NULL;
      }
      /* the marker itself still belongs to the header */
      return hit + mlen;
  }
  /*
   * Find the end of the current line.
   * Returns a pointer to the first "\r\n" within the first BSZ bytes of
   * BUF (pointing at the '\r'), or NULL when there is none.
   */
  static const char*
  _warc_find_eol(const char *buf, size_t bsz)
  {
      static const char _marker[] = "\r\n";

      return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
  }
  772. /* archive_read_support_format_warc.c ends here */