archive_read_support_format_warc.c 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832
  1. /*-
  2. * Copyright (c) 2014 Sebastian Freundt
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
  15. * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  16. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  17. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
  18. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  19. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  20. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  21. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  23. * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #include "archive_platform.h"
  26. __FBSDID("$FreeBSD$");
  27. /**
  28. * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
  29. * ISO 28500:2009.
  30. * For the purposes of this file we used the final draft from:
  31. * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
  32. *
  33. * Todo:
  34. * [ ] real-world warcs can contain resources at endpoints ending in /
  35. * e.g. http://bibnum.bnf.fr/warc/
  36. * if you're lucky their response contains a Content-Location: header
  37. * pointing to a unix-compliant filename, in the example above it's
  38. * Content-Location: http://bibnum.bnf.fr/warc/index.html
  39. * however, that's not mandated and github for example doesn't follow
  40. * this convention.
  41. * We need a set of archive options to control what to do with
  42. * entries like these, at the moment care is taken to skip them.
  43. *
  44. **/
  45. #ifdef HAVE_SYS_STAT_H
  46. #include <sys/stat.h>
  47. #endif
  48. #ifdef HAVE_ERRNO_H
  49. #include <errno.h>
  50. #endif
  51. #ifdef HAVE_STDLIB_H
  52. #include <stdlib.h>
  53. #endif
  54. #ifdef HAVE_STRING_H
  55. #include <string.h>
  56. #endif
  57. #ifdef HAVE_LIMITS_H
  58. #include <limits.h>
  59. #endif
  60. #ifdef HAVE_CTYPE_H
  61. #include <ctype.h>
  62. #endif
  63. #ifdef HAVE_TIME_H
  64. #include <time.h>
  65. #endif
  66. #include "archive.h"
  67. #include "archive_entry.h"
  68. #include "archive_private.h"
  69. #include "archive_read_private.h"
  /*
   * Record types a WARC header can announce in its WARC-Type field.
   * The numeric values are spelled out; they match the implicit
   * numbering of a plain enumerator list in this order.
   */
  typedef enum {
      /* no type, or one the type parser does not report */
      WT_NONE = 0,
      /* warcinfo */
      WT_INFO = 1,
      /* metadata */
      WT_META = 2,
      /* resource, turned into an archive entry by the header reader */
      WT_RSRC = 3,
      /* request, unsupported */
      WT_REQ = 4,
      /* response, handled like a resource by the header reader */
      WT_RSP = 5,
      /* revisit, unsupported */
      WT_RVIS = 6,
      /* conversion, unsupported */
      WT_CONV = 7,
      /* continuation, unsupported at the moment */
      WT_CONT = 8,
      /* invalid type, one past the last valid one */
      LAST_WT = 9
  } warc_type_t;
  /*
   * Read-only string slice: LEN bytes starting at STR.
   * Slices returned by the URI parser point into the header buffer and
   * are delimited by LEN only, so do not rely on a terminating NUL.
   */
  typedef struct {
      /* number of bytes in the slice */
      size_t len;
      /* first byte of the slice, NULL for the empty slice */
      const char *str;
  } warc_string_t;
  /*
   * Writable string buffer: STR points at LEN bytes of storage.
   * Used as the per-reader pool the entry pathname is copied into.
   */
  typedef struct {
      /* capacity of the buffer in bytes */
      size_t len;
      /* start of the storage */
      char *str;
  } warc_strbuf_t;
  99. struct warc_s {
  100. /* content length ahead */
  101. size_t cntlen;
  102. /* and how much we've processed so far */
  103. size_t cntoff;
  104. /* and how much we need to consume between calls */
  105. size_t unconsumed;
  106. /* string pool */
  107. warc_strbuf_t pool;
  108. /* previous version */
  109. unsigned int pver;
  110. /* stringified format name */
  111. struct archive_string sver;
  112. };
  113. static int _warc_bid(struct archive_read *a, int);
  114. static int _warc_cleanup(struct archive_read *a);
  115. static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
  116. static int _warc_skip(struct archive_read *a);
  117. static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
  118. /* private routines */
  119. static unsigned int _warc_rdver(const char buf[10], size_t bsz);
  120. static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
  121. static warc_string_t _warc_rduri(const char *buf, size_t bsz);
  122. static ssize_t _warc_rdlen(const char *buf, size_t bsz);
  123. static time_t _warc_rdrtm(const char *buf, size_t bsz);
  124. static time_t _warc_rdmtm(const char *buf, size_t bsz);
  125. static const char *_warc_find_eoh(const char *buf, size_t bsz);
  126. static const char *_warc_find_eol(const char *buf, size_t bsz);
  127. int
  128. archive_read_support_format_warc(struct archive *_a)
  129. {
  130. struct archive_read *a = (struct archive_read *)_a;
  131. struct warc_s *w;
  132. int r;
  133. archive_check_magic(_a, ARCHIVE_READ_MAGIC,
  134. ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
  135. if ((w = calloc(1, sizeof(*w))) == NULL) {
  136. archive_set_error(&a->archive, ENOMEM,
  137. "Can't allocate warc data");
  138. return (ARCHIVE_FATAL);
  139. }
  140. r = __archive_read_register_format(
  141. a, w, "warc",
  142. _warc_bid, NULL, _warc_rdhdr, _warc_read,
  143. _warc_skip, NULL, _warc_cleanup, NULL, NULL);
  144. if (r != ARCHIVE_OK) {
  145. free(w);
  146. return (r);
  147. }
  148. return (ARCHIVE_OK);
  149. }
  150. static int
  151. _warc_cleanup(struct archive_read *a)
  152. {
  153. struct warc_s *w = a->format->data;
  154. if (w->pool.len > 0U) {
  155. free(w->pool.str);
  156. }
  157. archive_string_free(&w->sver);
  158. free(w);
  159. a->format->data = NULL;
  160. return (ARCHIVE_OK);
  161. }
  /*
   * Bid callback: decide whether the stream looks like a WARC file.
   *
   * The first line of the file should already be a record, so the first
   * 12 bytes are probed for a "WARC/<version>" magic cookie.
   *
   * Returns 64 when a supported version (0.12 up to 1.0) is found and
   * -1 otherwise, including when fewer than 12 bytes can be read.
   * best_bid is not used.
   */
  static int
  _warc_bid(struct archive_read *a, int best_bid)
  {
      const char *probe;
      ssize_t avail;
      unsigned int version;

      (void)best_bid; /* UNUSED */

      probe = __archive_read_ahead(a, 12U, &avail);
      if (probe == NULL || avail < 12) {
          /* nothing to look at, or less than our magic cookie needs */
          return -1;
      }

      /* snarf the record's version number and see if we support it */
      version = _warc_rdver(probe, avail);
      if (version >= 1200U && version <= 10000U) {
          /* WARC 0.12 to 1.0, be confident */
          return (64);
      }
      return -1;
  }
  186. static int
  187. _warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
  188. {
  189. #define HDR_PROBE_LEN (12U)
  190. struct warc_s *w = a->format->data;
  191. unsigned int ver;
  192. const char *buf;
  193. ssize_t nrd;
  194. const char *eoh;
  195. /* for the file name, saves some strndup()'ing */
  196. warc_string_t fnam;
  197. /* warc record type, not that we really use it a lot */
  198. warc_type_t ftyp;
  199. /* content-length+error monad */
  200. ssize_t cntlen;
  201. /* record time is the WARC-Date time we reinterpret it as ctime */
  202. time_t rtime;
  203. /* mtime is the Last-Modified time which will be the entry's mtime */
  204. time_t mtime;
  205. start_over:
  206. /* just use read_ahead() they keep track of unconsumed
  207. * bits and bobs for us; no need to put an extra shift in
  208. * and reproduce that functionality here */
  209. buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
  210. if (nrd < 0) {
  211. /* no good */
  212. archive_set_error(
  213. &a->archive, ARCHIVE_ERRNO_MISC,
  214. "Bad record header");
  215. return (ARCHIVE_FATAL);
  216. } else if (buf == NULL) {
  217. /* there should be room for at least WARC/bla\r\n
  218. * must be EOF therefore */
  219. return (ARCHIVE_EOF);
  220. }
  221. /* looks good so far, try and find the end of the header now */
  222. eoh = _warc_find_eoh(buf, nrd);
  223. if (eoh == NULL) {
  224. /* still no good, the header end might be beyond the
  225. * probe we've requested, but then again who'd cram
  226. * so much stuff into the header *and* be 28500-compliant */
  227. archive_set_error(
  228. &a->archive, ARCHIVE_ERRNO_MISC,
  229. "Bad record header");
  230. return (ARCHIVE_FATAL);
  231. }
  232. ver = _warc_rdver(buf, eoh - buf);
  233. /* we currently support WARC 0.12 to 1.0 */
  234. if (ver == 0U) {
  235. archive_set_error(
  236. &a->archive, ARCHIVE_ERRNO_MISC,
  237. "Invalid record version");
  238. return (ARCHIVE_FATAL);
  239. } else if (ver < 1200U || ver > 10000U) {
  240. archive_set_error(
  241. &a->archive, ARCHIVE_ERRNO_MISC,
  242. "Unsupported record version: %u.%u",
  243. ver / 10000, (ver % 10000) / 100);
  244. return (ARCHIVE_FATAL);
  245. }
  246. cntlen = _warc_rdlen(buf, eoh - buf);
  247. if (cntlen < 0) {
  248. /* nightmare! the specs say content-length is mandatory
  249. * so I don't feel overly bad stopping the reader here */
  250. archive_set_error(
  251. &a->archive, EINVAL,
  252. "Bad content length");
  253. return (ARCHIVE_FATAL);
  254. }
  255. rtime = _warc_rdrtm(buf, eoh - buf);
  256. if (rtime == (time_t)-1) {
  257. /* record time is mandatory as per WARC/1.0,
  258. * so just barf here, fast and loud */
  259. archive_set_error(
  260. &a->archive, EINVAL,
  261. "Bad record time");
  262. return (ARCHIVE_FATAL);
  263. }
  264. /* let the world know we're a WARC archive */
  265. a->archive.archive_format = ARCHIVE_FORMAT_WARC;
  266. if (ver != w->pver) {
  267. /* stringify this entry's version */
  268. archive_string_sprintf(&w->sver,
  269. "WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
  270. /* remember the version */
  271. w->pver = ver;
  272. }
  273. /* start off with the type */
  274. ftyp = _warc_rdtyp(buf, eoh - buf);
  275. /* and let future calls know about the content */
  276. w->cntlen = cntlen;
  277. w->cntoff = 0U;
  278. mtime = 0;/* Avoid compiling error on some platform. */
  279. switch (ftyp) {
  280. case WT_RSRC:
  281. case WT_RSP:
  282. /* only try and read the filename in the cases that are
  283. * guaranteed to have one */
  284. fnam = _warc_rduri(buf, eoh - buf);
  285. /* check the last character in the URI to avoid creating
  286. * directory endpoints as files, see Todo above */
  287. if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
  288. /* break here for now */
  289. fnam.len = 0U;
  290. fnam.str = NULL;
  291. break;
  292. }
  293. /* bang to our string pool, so we save a
  294. * malloc()+free() roundtrip */
  295. if (fnam.len + 1U > w->pool.len) {
  296. w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
  297. w->pool.str = realloc(w->pool.str, w->pool.len);
  298. }
  299. memcpy(w->pool.str, fnam.str, fnam.len);
  300. w->pool.str[fnam.len] = '\0';
  301. /* let no one else know about the pool, it's a secret, shhh */
  302. fnam.str = w->pool.str;
  303. /* snarf mtime or deduce from rtime
  304. * this is a custom header added by our writer, it's quite
  305. * hard to believe anyone else would go through with it
  306. * (apart from being part of some http responses of course) */
  307. if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
  308. mtime = rtime;
  309. }
  310. break;
  311. default:
  312. fnam.len = 0U;
  313. fnam.str = NULL;
  314. break;
  315. }
  316. /* now eat some of those delicious buffer bits */
  317. __archive_read_consume(a, eoh - buf);
  318. switch (ftyp) {
  319. case WT_RSRC:
  320. case WT_RSP:
  321. if (fnam.len > 0U) {
  322. /* populate entry object */
  323. archive_entry_set_filetype(entry, AE_IFREG);
  324. archive_entry_copy_pathname(entry, fnam.str);
  325. archive_entry_set_size(entry, cntlen);
  326. archive_entry_set_perm(entry, 0644);
  327. /* rtime is the new ctime, mtime stays mtime */
  328. archive_entry_set_ctime(entry, rtime, 0L);
  329. archive_entry_set_mtime(entry, mtime, 0L);
  330. break;
  331. }
  332. /* FALLTHROUGH */
  333. default:
  334. /* consume the content and start over */
  335. _warc_skip(a);
  336. goto start_over;
  337. }
  338. return (ARCHIVE_OK);
  339. }
  340. static int
  341. _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
  342. {
  343. struct warc_s *w = a->format->data;
  344. const char *rab;
  345. ssize_t nrd;
  346. if (w->cntoff >= w->cntlen) {
  347. eof:
  348. /* it's our lucky day, no work, we can leave early */
  349. *buf = NULL;
  350. *bsz = 0U;
  351. *off = w->cntoff + 4U/*for \r\n\r\n separator*/;
  352. w->unconsumed = 0U;
  353. return (ARCHIVE_EOF);
  354. }
  355. if (w->unconsumed) {
  356. __archive_read_consume(a, w->unconsumed);
  357. w->unconsumed = 0U;
  358. }
  359. rab = __archive_read_ahead(a, 1U, &nrd);
  360. if (nrd < 0) {
  361. *bsz = 0U;
  362. /* big catastrophe */
  363. return (int)nrd;
  364. } else if (nrd == 0) {
  365. goto eof;
  366. } else if ((size_t)nrd > w->cntlen - w->cntoff) {
  367. /* clamp to content-length */
  368. nrd = w->cntlen - w->cntoff;
  369. }
  370. *off = w->cntoff;
  371. *bsz = nrd;
  372. *buf = rab;
  373. w->cntoff += nrd;
  374. w->unconsumed = (size_t)nrd;
  375. return (ARCHIVE_OK);
  376. }
  377. static int
  378. _warc_skip(struct archive_read *a)
  379. {
  380. struct warc_s *w = a->format->data;
  381. __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
  382. w->cntlen = 0U;
  383. w->cntoff = 0U;
  384. return (ARCHIVE_OK);
  385. }
  386. /* private routines */
  /*
   * Return C as a pointer to non-const without an explicit
   * qualifier-dropping cast (which would trip -Wcast-qual).
   *
   * The conversion goes through a union of `const void *` and `void *`,
   * two types with identical representation, instead of doing pointer
   * arithmetic relative to a made-up address, which is undefined.
   * The caller must not write through the result if the object C
   * points to is actually const.
   */
  static void*
  deconst(const void *c)
  {
      union {
          const void *in;
          void *out;
      } conv;

      conv.in = c;
      return conv.out;
  }
  /*
   * memmem() work-alike: find the first occurrence of the NEEDLESIZE
   * bytes at NEEDLE within the HAYSIZE bytes at HAY.
   *
   * Returns a pointer to the match inside HAY, HAY itself for an empty
   * needle, or NULL when there is no match.
   *
   * The search jumps to the first byte equal to *NEEDLE, then slides a
   * window of NEEDLESIZE bytes over the rest while maintaining the XOR
   * of the bytes in the window; only windows whose XOR equals the
   * needle's are compared byte-wise.
   */
  static char*
  xmemmem(const char *hay, const size_t haysize,
      const char *needle, const size_t needlesize)
  {
      const char *const hay_end = hay + haysize;
      const char *const needle_end = needle + needlesize;
      const char *hcur;
      const char *ncur;
      const char *win;
      unsigned int hxor;
      unsigned int nxor;
      unsigned int same;

      /* a 0-sized needle is defined to be found anywhere in haystack */
      if (needlesize == 0UL) {
          return deconst(hay);
      }
      /* find a candidate, i.e. a portion that begins with *NEEDLE */
      hay = memchr(hay, *needle, haysize);
      if (hay == NULL) {
          return NULL;
      }

      /* First bytes of haystack and needle are the same now.  Fold the
       * remaining needle bytes, and as many haystack bytes, into their
       * XOR sums and note on the way whether they all agree. */
      hcur = hay + 1U;
      ncur = needle + 1U;
      hxor = *hay;
      nxor = *hay;
      same = 1U;
      while (hcur < hay_end && ncur < needle_end) {
          hxor ^= *hcur;
          nxor ^= *ncur;
          same &= *hcur == *ncur;
          hcur++;
          ncur++;
      }

      /* HCUR now references the (NEEDLESIZE + 1)-th character. */
      if (ncur < needle_end) {
          /* what is left of the haystack is smaller than the needle */
          return NULL;
      }
      if (same) {
          /* found a match right at the candidate */
          return deconst(hay);
      }

      /* now slide the window through the rest of the haystack,
       * updating the sum iteratively */
      win = hay;
      while (hcur < hay_end) {
          /* drop the byte leaving the window, add the one entering */
          hxor ^= *win++;
          hxor ^= *hcur;

          /* With equal sums it is enough to check just NEEDLESIZE - 1
           * characters for equality; WIN is by design < HCUR, so no
           * need for range checks. */
          if (hxor == nxor &&
              memcmp(win, needle, needlesize - 1U) == 0) {
              return deconst(win);
          }
          hcur++;
      }
      return NULL;
  }
  /*
   * Parse an unsigned decimal number at STR that is expected to lie
   * within [LLIM, ULIM].
   *
   * Digits are taken while the value read so far, times ten, does not
   * exceed ULIM and while the digit budget lasts; the budget allows as
   * many digits as max(ULIM, 10) has.
   *
   * *EP is set to the first character not consumed.  Returns the value,
   * -1 if no digit could be read, or -2 if the value is out of range.
   */
  static int
  strtoi_lim(const char *str, const char **ep, int llim, int ulim)
  {
      const char *cur = str;
      /* divided by 10 per digit taken, i.e. counts the digits left */
      int budget = (ulim > 10) ? ulim : 10;
      int val = 0;

      while (val * 10 <= ulim && budget != 0 &&
          *cur >= '0' && *cur <= '9') {
          val = val * 10 + (*cur - '0');
          cur++;
          budget /= 10;
      }

      *ep = cur;
      if (cur == str) {
          return -1;
      }
      if (val < llim || val > ulim) {
          return -2;
      }
      return val;
  }
  /*
   * Convert the broken-down UTC time T into seconds since the epoch.
   *
   * Uses timegm() or _mkgmtime64() where the build configuration says
   * they exist.  Otherwise mktime() is called only to have tm_yday
   * filled in (it may normalise other fields of T as well), and the
   * result is computed from first principles using POSIX assumptions.
   *
   * The fallback arithmetic is carried out in 64 bits: the year term
   * alone exceeds the range of a 32-bit int for dates after 2038.
   *
   * Returns (time_t)-1 when mktime() rejects T.
   */
  static time_t
  time_from_tm(struct tm *t)
  {
  #if HAVE_TIMEGM
      /* Use platform timegm() if available. */
      return (timegm(t));
  #elif HAVE__MKGMTIME64
      return (_mkgmtime64(t));
  #else
      int64_t year;
      int64_t secs;

      /* Else use direct calculation using POSIX assumptions. */
      /* First, fix up tm_yday based on the year/month/day. */
      if (mktime(t) == (time_t)-1)
          return ((time_t)-1);
      /* Then we can compute timegm() from first principles. */
      year = t->tm_year;
      secs = t->tm_sec;
      secs += (int64_t)t->tm_min * 60;
      secs += (int64_t)t->tm_hour * 3600;
      secs += (int64_t)t->tm_yday * 86400;
      secs += (year - 70) * 31536000;
      /* leap day corrections: every 4th year, except every 100th,
       * except every 400th */
      secs += ((year - 69) / 4) * 86400;
      secs -= ((year - 1) / 100) * 86400;
      secs += ((year + 299) / 400) * 86400;
      return ((time_t)secs);
  #endif
  }
  /*
   * Like strptime() but strictly for ISO 8601 Zulu strings of the form
   * YYYY-MM-DDThh:mm:ssZ, with leading blanks and tabs skipped.
   *
   * Accepted ranges: year 1583..4095, month 1..12, day 1..31,
   * hour 0..23, minute 0..59, second 0..60.
   *
   * Returns the corresponding UTC time stamp, or (time_t)-1 if the
   * string does not parse.  If ENDPTR is not NULL, *ENDPTR is set to
   * the position where parsing stopped.
   */
  static time_t
  xstrpisotime(const char *s, char **endptr)
  {
      struct tm parsed;
      time_t stamp = (time_t)-1;
      const char *cur = s;

      /* make sure the broken-down time is clean */
      memset(&parsed, 0, sizeof(parsed));

      /* as a courtesy to our callers, and since this is a non-standard
       * routine, we skip leading whitespace */
      while (*cur == ' ' || *cur == '\t') {
          ++cur;
      }

      /* year, followed by a dash */
      parsed.tm_year = strtoi_lim(cur, &cur, 1583, 4095);
      if (parsed.tm_year < 0 || *cur++ != '-') {
          goto done;
      }
      /* month, followed by a dash */
      parsed.tm_mon = strtoi_lim(cur, &cur, 1, 12);
      if (parsed.tm_mon < 0 || *cur++ != '-') {
          goto done;
      }
      /* day-of-month, followed by the date/time separator */
      parsed.tm_mday = strtoi_lim(cur, &cur, 1, 31);
      if (parsed.tm_mday < 0 || *cur++ != 'T') {
          goto done;
      }
      /* hour, followed by a colon */
      parsed.tm_hour = strtoi_lim(cur, &cur, 0, 23);
      if (parsed.tm_hour < 0 || *cur++ != ':') {
          goto done;
      }
      /* minute, followed by a colon */
      parsed.tm_min = strtoi_lim(cur, &cur, 0, 59);
      if (parsed.tm_min < 0 || *cur++ != ':') {
          goto done;
      }
      /* second, followed by the Zulu designator */
      parsed.tm_sec = strtoi_lim(cur, &cur, 0, 60);
      if (parsed.tm_sec < 0 || *cur++ != 'Z') {
          goto done;
      }

      /* struct tm counts years from 1900 and months from 0 */
      parsed.tm_year -= 1900;
      parsed.tm_mon--;

      /* now convert the broken-down time to a unix stamp using UTC */
      stamp = time_from_tm(&parsed);

  done:
      if (endptr != NULL) {
          *endptr = deconst(cur);
      }
      return stamp;
  }
  /*
   * Read the version from the "WARC/<major>.<minor>" line at the start
   * of BUF, which must hold at least 12 bytes.
   *
   * The major version is a single digit, the minor version one or two
   * digits.  The result is major * 10000 + minor * 100, e.g. 10000 for
   * 1.0, 1200 for 0.12 and 900 for 0.9.
   *
   * Returns 0 if the buffer is too small, the magic is missing, the
   * version is malformed, or the version is not terminated properly:
   * by CRLF from 0.12 onwards, by a blank or tab below that.
   */
  static unsigned int
  _warc_rdver(const char *buf, size_t bsz)
  {
      static const char magic[] = "WARC/";
      const size_t magic_len = sizeof(magic) - 1U;
      const char *digits;
      const char *tail;
      unsigned int two_digit_minor;
      unsigned int ver;

      if (bsz < 12 || memcmp(buf, magic, magic_len) != 0) {
          /* buffer too small or invalid magic */
          return 0U;
      }

      /* looks good so far, the version follows the magic */
      digits = buf + magic_len;
      if (!isdigit((unsigned char)digits[0U]) || digits[1U] != '.' ||
          !isdigit((unsigned char)digits[2U])) {
          return 0U;
      }

      /* we support a maximum of 2 digits in the minor version */
      two_digit_minor = isdigit((unsigned char)digits[3U]) ? 1U : 0U;

      ver = (digits[0U] - '0') * 10000U;
      if (two_digit_minor) {
          ver += (digits[2U] - '0') * 1000U;
          ver += (digits[3U] - '0') * 100U;
      } else {
          ver += (digits[2U] - '0') * 100U;
      }

      /*
       * WARC below version 0.12 has a space-separated header
       * WARC 0.12 and above terminates the version with a CRLF
       */
      tail = digits + 3U + two_digit_minor;
      if (ver >= 1200U) {
          if (memcmp(tail, "\r\n", 2U) != 0) {
              return 0U;
          }
      } else if (*tail != ' ' && *tail != '\t') {
          return 0U;
      }
      return ver;
  }
  578. static unsigned int
  579. _warc_rdtyp(const char *buf, size_t bsz)
  580. {
  581. static const char _key[] = "\r\nWARC-Type:";
  582. const char *val, *eol;
  583. if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
  584. /* no bother */
  585. return WT_NONE;
  586. }
  587. val += sizeof(_key) - 1U;
  588. if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
  589. /* no end of line */
  590. return WT_NONE;
  591. }
  592. /* overread whitespace */
  593. while (val < eol && (*val == ' ' || *val == '\t'))
  594. ++val;
  595. if (val + 8U == eol) {
  596. if (memcmp(val, "resource", 8U) == 0)
  597. return WT_RSRC;
  598. else if (memcmp(val, "response", 8U) == 0)
  599. return WT_RSP;
  600. }
  601. return WT_NONE;
  602. }
  603. static warc_string_t
  604. _warc_rduri(const char *buf, size_t bsz)
  605. {
  606. static const char _key[] = "\r\nWARC-Target-URI:";
  607. const char *val, *uri, *eol, *p;
  608. warc_string_t res = {0U, NULL};
  609. if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
  610. /* no bother */
  611. return res;
  612. }
  613. /* overread whitespace */
  614. val += sizeof(_key) - 1U;
  615. if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
  616. /* no end of line */
  617. return res;
  618. }
  619. while (val < eol && (*val == ' ' || *val == '\t'))
  620. ++val;
  621. /* overread URL designators */
  622. if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
  623. /* not touching that! */
  624. return res;
  625. }
  626. /* spaces inside uri are not allowed, CRLF should follow */
  627. for (p = val; p < eol; p++) {
  628. if (isspace((unsigned char)*p))
  629. return res;
  630. }
  631. /* there must be at least space for ftp */
  632. if (uri < (val + 3U))
  633. return res;
  634. /* move uri to point to after :// */
  635. uri += 3U;
  636. /* now then, inspect the URI */
  637. if (memcmp(val, "file", 4U) == 0) {
  638. /* perfect, nothing left to do here */
  639. } else if (memcmp(val, "http", 4U) == 0 ||
  640. memcmp(val, "ftp", 3U) == 0) {
  641. /* overread domain, and the first / */
  642. while (uri < eol && *uri++ != '/');
  643. } else {
  644. /* not sure what to do? best to bugger off */
  645. return res;
  646. }
  647. res.str = uri;
  648. res.len = eol - uri;
  649. return res;
  650. }
  /*
   * Read the Content-Length field of the header in BUF.
   *
   * The value may be preceded by blanks and tabs, must start with a
   * digit, and must extend right up to the end of the line.
   *
   * Returns the length, or -1 if the field is missing, has no line end,
   * or does not hold a valid decimal number.
   */
  static ssize_t
  _warc_rdlen(const char *buf, size_t bsz)
  {
      static const char _key[] = "\r\nContent-Length:";
      const size_t key_len = sizeof(_key) - 1U;
      const char *val;
      const char *eol;
      char *stop = NULL;
      long int len;

      val = xmemmem(buf, bsz, _key, key_len);
      if (val == NULL) {
          /* no such field */
          return -1;
      }
      val += key_len;

      eol = _warc_find_eol(val, buf + bsz - val);
      if (eol == NULL) {
          /* no end of line */
          return -1;
      }

      /* skip leading whitespace */
      while (val < eol && (*val == ' ' || *val == '\t')) {
          val++;
      }

      /* there must be at least one digit */
      if (!isdigit((unsigned char)*val)) {
          return -1;
      }

      errno = 0;
      len = strtol(val, &stop, 10);
      if (errno != 0 || stop != eol) {
          /* out of range, or the line does not end after the number */
          return -1;
      }
      return (size_t)len;
  }
  /*
   * Read the record time from the WARC-Date field of the header in BUF.
   *
   * The value must be an ISO 8601 Zulu time stamp that extends right up
   * to the end of the line.
   *
   * Returns the time stamp, or (time_t)-1 if the field is missing, has
   * no line end, or does not parse.
   */
  static time_t
  _warc_rdrtm(const char *buf, size_t bsz)
  {
      static const char _key[] = "\r\nWARC-Date:";
      const size_t key_len = sizeof(_key) - 1U;
      const char *val;
      const char *eol;
      char *stop = NULL;
      time_t stamp;

      val = xmemmem(buf, bsz, _key, key_len);
      if (val == NULL) {
          /* no such field */
          return (time_t)-1;
      }
      val += key_len;

      eol = _warc_find_eol(val, buf + bsz - val);
      if (eol == NULL) {
          /* no end of line */
          return (time_t)-1;
      }

      /* xstrpisotime() kindly overreads whitespace for us, so use that */
      stamp = xstrpisotime(val, &stop);
      if (stop != eol) {
          /* line must end right after the time stamp */
          return (time_t)-1;
      }
      return stamp;
  }
  /*
   * Read the modification time from the Last-Modified field of the
   * header in BUF.
   *
   * The value must be an ISO 8601 Zulu time stamp that extends right up
   * to the end of the line.
   *
   * Returns the time stamp, or (time_t)-1 if the field is missing, has
   * no line end, or does not parse.
   */
  static time_t
  _warc_rdmtm(const char *buf, size_t bsz)
  {
      static const char _key[] = "\r\nLast-Modified:";
      const size_t key_len = sizeof(_key) - 1U;
      const char *val;
      const char *eol;
      char *stop = NULL;
      time_t stamp;

      val = xmemmem(buf, bsz, _key, key_len);
      if (val == NULL) {
          /* no such field */
          return (time_t)-1;
      }
      val += key_len;

      eol = _warc_find_eol(val, buf + bsz - val);
      if (eol == NULL) {
          /* no end of line */
          return (time_t)-1;
      }

      /* xstrpisotime() kindly overreads whitespace for us, so use that */
      stamp = xstrpisotime(val, &stop);
      if (stop != eol) {
          /* line must end right after the time stamp */
          return (time_t)-1;
      }
      return stamp;
  }
  /*
   * Find the end of the record header in the BSZ bytes at BUF.
   *
   * Returns a pointer to the first byte after the "\r\n\r\n" that
   * terminates the header, or NULL if there is no such sequence.
   */
  static const char*
  _warc_find_eoh(const char *buf, size_t bsz)
  {
      static const char _marker[] = "\r\n\r\n";
      const size_t marker_len = sizeof(_marker) - 1U;
      const char *at = xmemmem(buf, bsz, _marker, marker_len);

      return (at == NULL) ? NULL : at + marker_len;
  }
  /*
   * Find the end of the current line in the BSZ bytes at BUF.
   *
   * Returns a pointer to the first "\r\n" (i.e. to the '\r'), or NULL
   * if there is none.
   */
  static const char*
  _warc_find_eol(const char *buf, size_t bsz)
  {
      static const char _marker[] = "\r\n";
      const size_t marker_len = sizeof(_marker) - 1U;

      return xmemmem(buf, bsz, _marker, marker_len);
  }
  746. /* archive_read_support_format_warc.c ends here */