12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858 |
- /*-
- * Copyright (c) 2014 Sebastian Freundt
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
- #include "archive_platform.h"
- /**
- * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
- * ISO 28500:2009.
- * For the purposes of this file we used the final draft from:
- * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
- *
- * Todo:
- * [ ] real-world warcs can contain resources at endpoints ending in /
- * e.g. http://bibnum.bnf.fr/warc/
- * if you're lucky their response contains a Content-Location: header
- * pointing to a unix-compliant filename, in the example above it's
- * Content-Location: http://bibnum.bnf.fr/warc/index.html
- * however, that's not mandated and github for example doesn't follow
- * this convention.
- * We need a set of archive options to control what to do with
- * entries like these, at the moment care is taken to skip them.
- *
- **/
- #ifdef HAVE_SYS_STAT_H
- #include <sys/stat.h>
- #endif
- #ifdef HAVE_ERRNO_H
- #include <errno.h>
- #endif
- #ifdef HAVE_STDLIB_H
- #include <stdlib.h>
- #endif
- #ifdef HAVE_STRING_H
- #include <string.h>
- #endif
- #ifdef HAVE_LIMITS_H
- #include <limits.h>
- #endif
- #ifdef HAVE_CTYPE_H
- #include <ctype.h>
- #endif
- #ifdef HAVE_TIME_H
- #include <time.h>
- #endif
- #include "archive.h"
- #include "archive_entry.h"
- #include "archive_private.h"
- #include "archive_read_private.h"
/* Record types as announced by a record's WARC-Type header field. */
typedef enum {
	WT_NONE = 0,
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response, unsupported */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type */
} warc_type_t;
/*
 * Read-only string slice: LEN bytes starting at STR.
 * The bytes are not required to be NUL-terminated.
 */
typedef struct {
	size_t len;
	const char *str;
} warc_string_t;
/*
 * Writable string buffer: STR points at storage whose size in bytes
 * is tracked in LEN.
 */
typedef struct {
	size_t len;
	char *str;
} warc_strbuf_t;
- struct warc_s {
- /* content length ahead */
- size_t cntlen;
- /* and how much we've processed so far */
- size_t cntoff;
- /* and how much we need to consume between calls */
- size_t unconsumed;
- /* string pool */
- warc_strbuf_t pool;
- /* previous version */
- unsigned int pver;
- /* stringified format name */
- struct archive_string sver;
- };
- static int _warc_bid(struct archive_read *a, int);
- static int _warc_cleanup(struct archive_read *a);
- static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
- static int _warc_skip(struct archive_read *a);
- static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
- /* private routines */
- static unsigned int _warc_rdver(const char *buf, size_t bsz);
- static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
- static warc_string_t _warc_rduri(const char *buf, size_t bsz);
- static ssize_t _warc_rdlen(const char *buf, size_t bsz);
- static time_t _warc_rdrtm(const char *buf, size_t bsz);
- static time_t _warc_rdmtm(const char *buf, size_t bsz);
- static const char *_warc_find_eoh(const char *buf, size_t bsz);
- static const char *_warc_find_eol(const char *buf, size_t bsz);
- int
- archive_read_support_format_warc(struct archive *_a)
- {
- struct archive_read *a = (struct archive_read *)_a;
- struct warc_s *w;
- int r;
- archive_check_magic(_a, ARCHIVE_READ_MAGIC,
- ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
- if ((w = calloc(1, sizeof(*w))) == NULL) {
- archive_set_error(&a->archive, ENOMEM,
- "Can't allocate warc data");
- return (ARCHIVE_FATAL);
- }
- r = __archive_read_register_format(
- a, w, "warc",
- _warc_bid, NULL, _warc_rdhdr, _warc_read,
- _warc_skip, NULL, _warc_cleanup, NULL, NULL);
- if (r != ARCHIVE_OK) {
- free(w);
- return (r);
- }
- return (ARCHIVE_OK);
- }
- static int
- _warc_cleanup(struct archive_read *a)
- {
- struct warc_s *w = a->format->data;
- if (w->pool.len > 0U) {
- free(w->pool.str);
- }
- archive_string_free(&w->sver);
- free(w);
- a->format->data = NULL;
- return (ARCHIVE_OK);
- }
/*
 * Bid callback: inspect the first 12 bytes of the stream, which must
 * already be the version line of a record.
 *
 * Returns 64 when the version is within WARC 0.12 .. 1.0, and -1 when
 * fewer than 12 bytes are available or the version line does not parse
 * or is out of range.  BEST_BID is not used.
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int version;

	(void)best_bid; /* UNUSED */

	/* our magic cookie is at least 12 bytes long */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12) {
		return -1;
	}

	/* we only support WARC 0.12 to 1.0 */
	version = _warc_rdver(probe, avail);
	if (version >= 1200U && version <= 10000U) {
		/* be confident */
		return (64);
	}
	return -1;
}
- static int
- _warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
- {
- #define HDR_PROBE_LEN (12U)
- struct warc_s *w = a->format->data;
- unsigned int ver;
- const char *buf;
- ssize_t nrd;
- const char *eoh;
- char *tmp;
- /* for the file name, saves some strndup()'ing */
- warc_string_t fnam;
- /* warc record type, not that we really use it a lot */
- warc_type_t ftyp;
- /* content-length+error monad */
- ssize_t cntlen;
- /* record time is the WARC-Date time we reinterpret it as ctime */
- time_t rtime;
- /* mtime is the Last-Modified time which will be the entry's mtime */
- time_t mtime;
- start_over:
- /* just use read_ahead() they keep track of unconsumed
- * bits and bobs for us; no need to put an extra shift in
- * and reproduce that functionality here */
- buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
- if (nrd < 0) {
- /* no good */
- archive_set_error(
- &a->archive, ARCHIVE_ERRNO_MISC,
- "Bad record header");
- return (ARCHIVE_FATAL);
- } else if (buf == NULL) {
- /* there should be room for at least WARC/bla\r\n
- * must be EOF therefore */
- return (ARCHIVE_EOF);
- }
- /* looks good so far, try and find the end of the header now */
- eoh = _warc_find_eoh(buf, nrd);
- if (eoh == NULL) {
- /* still no good, the header end might be beyond the
- * probe we've requested, but then again who'd cram
- * so much stuff into the header *and* be 28500-compliant */
- archive_set_error(
- &a->archive, ARCHIVE_ERRNO_MISC,
- "Bad record header");
- return (ARCHIVE_FATAL);
- }
- ver = _warc_rdver(buf, eoh - buf);
- /* we currently support WARC 0.12 to 1.0 */
- if (ver == 0U) {
- archive_set_error(
- &a->archive, ARCHIVE_ERRNO_MISC,
- "Invalid record version");
- return (ARCHIVE_FATAL);
- } else if (ver < 1200U || ver > 10000U) {
- archive_set_error(
- &a->archive, ARCHIVE_ERRNO_MISC,
- "Unsupported record version: %u.%u",
- ver / 10000, (ver % 10000) / 100);
- return (ARCHIVE_FATAL);
- }
- cntlen = _warc_rdlen(buf, eoh - buf);
- if (cntlen < 0) {
- /* nightmare! the specs say content-length is mandatory
- * so I don't feel overly bad stopping the reader here */
- archive_set_error(
- &a->archive, EINVAL,
- "Bad content length");
- return (ARCHIVE_FATAL);
- }
- rtime = _warc_rdrtm(buf, eoh - buf);
- if (rtime == (time_t)-1) {
- /* record time is mandatory as per WARC/1.0,
- * so just barf here, fast and loud */
- archive_set_error(
- &a->archive, EINVAL,
- "Bad record time");
- return (ARCHIVE_FATAL);
- }
- /* let the world know we're a WARC archive */
- a->archive.archive_format = ARCHIVE_FORMAT_WARC;
- if (ver != w->pver) {
- /* stringify this entry's version */
- archive_string_sprintf(&w->sver,
- "WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
- /* remember the version */
- w->pver = ver;
- }
- /* start off with the type */
- ftyp = _warc_rdtyp(buf, eoh - buf);
- /* and let future calls know about the content */
- w->cntlen = cntlen;
- w->cntoff = 0U;
- mtime = 0;/* Avoid compiling error on some platform. */
- switch (ftyp) {
- case WT_RSRC:
- case WT_RSP:
- /* only try and read the filename in the cases that are
- * guaranteed to have one */
- fnam = _warc_rduri(buf, eoh - buf);
- /* check the last character in the URI to avoid creating
- * directory endpoints as files, see Todo above */
- if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
- /* break here for now */
- fnam.len = 0U;
- fnam.str = NULL;
- break;
- }
- /* bang to our string pool, so we save a
- * malloc()+free() roundtrip */
- if (fnam.len + 1U > w->pool.len) {
- w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
- tmp = realloc(w->pool.str, w->pool.len);
- if (tmp == NULL) {
- archive_set_error(
- &a->archive, ENOMEM,
- "Out of memory");
- return (ARCHIVE_FATAL);
- }
- w->pool.str = tmp;
- }
- memcpy(w->pool.str, fnam.str, fnam.len);
- w->pool.str[fnam.len] = '\0';
- /* let no one else know about the pool, it's a secret, shhh */
- fnam.str = w->pool.str;
- /* snarf mtime or deduce from rtime
- * this is a custom header added by our writer, it's quite
- * hard to believe anyone else would go through with it
- * (apart from being part of some http responses of course) */
- if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
- mtime = rtime;
- }
- break;
- case WT_NONE:
- case WT_INFO:
- case WT_META:
- case WT_REQ:
- case WT_RVIS:
- case WT_CONV:
- case WT_CONT:
- case LAST_WT:
- default:
- fnam.len = 0U;
- fnam.str = NULL;
- break;
- }
- /* now eat some of those delicious buffer bits */
- __archive_read_consume(a, eoh - buf);
- switch (ftyp) {
- case WT_RSRC:
- case WT_RSP:
- if (fnam.len > 0U) {
- /* populate entry object */
- archive_entry_set_filetype(entry, AE_IFREG);
- archive_entry_copy_pathname(entry, fnam.str);
- archive_entry_set_size(entry, cntlen);
- archive_entry_set_perm(entry, 0644);
- /* rtime is the new ctime, mtime stays mtime */
- archive_entry_set_ctime(entry, rtime, 0L);
- archive_entry_set_mtime(entry, mtime, 0L);
- break;
- }
- /* FALLTHROUGH */
- case WT_NONE:
- case WT_INFO:
- case WT_META:
- case WT_REQ:
- case WT_RVIS:
- case WT_CONV:
- case WT_CONT:
- case LAST_WT:
- default:
- /* consume the content and start over */
- if (_warc_skip(a) < 0)
- return (ARCHIVE_FATAL);
- goto start_over;
- }
- return (ARCHIVE_OK);
- }
- static int
- _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
- {
- struct warc_s *w = a->format->data;
- const char *rab;
- ssize_t nrd;
- if (w->cntoff >= w->cntlen) {
- eof:
- /* it's our lucky day, no work, we can leave early */
- *buf = NULL;
- *bsz = 0U;
- *off = w->cntoff + 4U/*for \r\n\r\n separator*/;
- w->unconsumed = 0U;
- return (ARCHIVE_EOF);
- }
- if (w->unconsumed) {
- __archive_read_consume(a, w->unconsumed);
- w->unconsumed = 0U;
- }
- rab = __archive_read_ahead(a, 1U, &nrd);
- if (nrd < 0) {
- *bsz = 0U;
- /* big catastrophe */
- return (int)nrd;
- } else if (nrd == 0) {
- goto eof;
- } else if ((size_t)nrd > w->cntlen - w->cntoff) {
- /* clamp to content-length */
- nrd = w->cntlen - w->cntoff;
- }
- *off = w->cntoff;
- *bsz = nrd;
- *buf = rab;
- w->cntoff += nrd;
- w->unconsumed = (size_t)nrd;
- return (ARCHIVE_OK);
- }
- static int
- _warc_skip(struct archive_read *a)
- {
- struct warc_s *w = a->format->data;
- if (__archive_read_consume(a, w->cntlen) < 0 ||
- __archive_read_consume(a, 4U/*\r\n\r\n separator*/) < 0)
- return (ARCHIVE_FATAL);
- w->cntlen = 0U;
- w->cntoff = 0U;
- return (ARCHIVE_OK);
- }
- /* private routines */
/*
 * Strip the const qualifier off a pointer by round-tripping it through
 * uintptr_t.  The address itself is returned unchanged.
 */
static void*
deconst(const void *c)
{
	const uintptr_t addr = (uintptr_t)c;

	return (void *)addr;
}
/*
 * memmem() work-alike: locate the first occurrence of the NEEDLESIZE
 * bytes at NEEDLE within the HAYSIZE bytes at HAY.
 *
 * Returns a pointer to the match inside HAY, HAY itself for an empty
 * needle, or NULL when there is no match.
 *
 * Works with a rolling XOR checksum over a window of NEEDLESIZE bytes
 * and only runs memcmp() where the checksums agree.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const needle_end = needle + needlesize;
	const char *h;
	const char *n;
	const char *win;
	unsigned int hay_sum;
	unsigned int needle_sum;
	unsigned int all_equal;

	/* a 0-sized needle is defined to be found anywhere in haystack */
	if (needlesize == 0UL) {
		return deconst(hay);
	}
	/* fast-forward to the first byte that could start a match */
	hay = memchr(hay, *needle, haysize);
	if (hay == NULL) {
		return NULL;
	}

	/* The first bytes of haystack and needle agree now.  Fold the
	 * remaining needle bytes, and as many haystack bytes, into their
	 * checksums while remembering whether all of them were equal. */
	h = hay + 1U;
	n = needle + 1U;
	hay_sum = *hay;
	needle_sum = *hay;
	all_equal = 1U;
	while (h < hay_end && n < needle_end) {
		hay_sum ^= *h;
		needle_sum ^= *n;
		all_equal &= *h == *n;
		h++;
		n++;
	}

	/* H now references the (NEEDLESIZE + 1)-th byte of the window */
	if (n < needle_end) {
		/* ran out of haystack before the needle was covered */
		return NULL;
	}
	if (all_equal) {
		/* match right at the candidate */
		return deconst(hay);
	}

	/* slide the window across the rest of the haystack,
	 * updating the checksum as bytes leave and enter */
	win = hay;
	while (h < hay_end) {
		hay_sum ^= *win++;
		hay_sum ^= *h;

		/* With equal checksums it suffices to compare the first
		 * NEEDLESIZE - 1 bytes; WIN is always below H, so the
		 * comparison stays within the haystack. */
		if (hay_sum == needle_sum &&
		    memcmp(win, needle, needlesize - 1U) == 0) {
			return deconst(win);
		}
		h++;
	}
	return NULL;
}
/*
 * Parse a short run of decimal digits at STR.
 *
 * At most as many digits as ULIM (or 10, if larger) has decimal places
 * are read.  *EP is set to the first unread character.
 *
 * Returns the parsed value, -1 when STR does not start with a digit,
 * or -2 when the value lies outside [LLIM, ULIM].
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	/* dividing this by 10 per digit bounds the number of digits */
	int budget = (ulim > 10) ? ulim : 10;
	int val = 0;

	while (val * 10 <= ulim && budget != 0 &&
	    *cur >= '0' && *cur <= '9') {
		val = val * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}

	if (cur == str) {
		/* not a single digit */
		val = -1;
	} else if (val < llim || val > ulim) {
		/* out of range */
		val = -2;
	}
	*ep = cur;
	return val;
}
/*
 * Convert a broken-down UTC time into seconds since the epoch.
 *
 * Uses _mkgmtime() or timegm() where the build announces them via
 * HAVE__MKGMTIME / HAVE_TIMEGM.  Otherwise mktime() is called to fill
 * in tm_yday (note that it may normalise *T) and the stamp is computed
 * directly from the fields.
 *
 * Returns (time_t)-1 when mktime() fails in the fallback path.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE__MKGMTIME
	return _mkgmtime(t);
#elif HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#else
	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles.
	 * Every term is widened to time_t before it is scaled: in plain
	 * int arithmetic (tm_year - 70) * 31536000 overflows for any
	 * year from 2039 onwards. */
	return ((time_t)t->tm_sec
	    + (time_t)t->tm_min * 60
	    + (time_t)t->tm_hour * 3600
	    + (time_t)t->tm_yday * 86400
	    + (time_t)(t->tm_year - 70) * 31536000
	    + (time_t)((t->tm_year - 69) / 4) * 86400
	    - (time_t)((t->tm_year - 1) / 100) * 86400
	    + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
/*
 * Like strptime() but strictly for ISO 8601 Zulu stamps of the form
 * YYYY-MM-DDThh:mm:ssZ, with leading blanks and tabs skipped.
 *
 * Returns the stamp as seconds since the epoch, or (time_t)-1 when the
 * string does not parse.  If ENDPTR is not NULL, *ENDPTR receives the
 * position at which parsing stopped.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	struct tm tm;
	time_t stamp = (time_t)-1;
	const char *p = s;

	/* make sure tm is clean */
	memset(&tm, 0, sizeof(tm));

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace */
	for (; *p == ' ' || *p == '\t'; p++)
		;

	/* year */
	tm.tm_year = strtoi_lim(p, &p, 1583, 4095);
	if (tm.tm_year < 0 || *p++ != '-') {
		goto done;
	}
	/* month */
	tm.tm_mon = strtoi_lim(p, &p, 1, 12);
	if (tm.tm_mon < 0 || *p++ != '-') {
		goto done;
	}
	/* day-of-month */
	tm.tm_mday = strtoi_lim(p, &p, 1, 31);
	if (tm.tm_mday < 0 || *p++ != 'T') {
		goto done;
	}
	/* hour */
	tm.tm_hour = strtoi_lim(p, &p, 0, 23);
	if (tm.tm_hour < 0 || *p++ != ':') {
		goto done;
	}
	/* minute */
	tm.tm_min = strtoi_lim(p, &p, 0, 59);
	if (tm.tm_min < 0 || *p++ != ':') {
		goto done;
	}
	/* second, 60 is let through */
	tm.tm_sec = strtoi_lim(p, &p, 0, 60);
	if (tm.tm_sec < 0 || *p++ != 'Z') {
		goto done;
	}

	/* massage TM to fulfill some of POSIX' constraints */
	tm.tm_year -= 1900;
	tm.tm_mon--;

	/* now convert our custom tm struct to a unix stamp using UTC */
	stamp = time_from_tm(&tm);

done:
	if (endptr != NULL) {
		*endptr = deconst(p);
	}
	return stamp;
}
/*
 * Read the version off a "WARC/x.y" or "WARC/x.yy" line at BUF.
 *
 * Returns major * 10000 + minor * 100, where a one-digit minor counts
 * as-is and a two-digit minor as a two-digit number (so 1.0 yields
 * 10000 and 0.12 yields 1200).  Returns 0 when BSZ is below 12, the
 * magic is missing, the version is malformed, or the version is not
 * terminated the way its format demands.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t magic_len = sizeof(magic) - 1U;
	const char *v;
	const char *tail;
	unsigned int ver;
	unsigned int extra;

	if (bsz < 12 || memcmp(buf, magic, magic_len) != 0) {
		/* buffer too small or invalid magic */
		return 0U;
	}

	/* looks good so far, read the version number for a laugh */
	v = buf + magic_len;
	if (!isdigit((unsigned char)v[0U]) || v[1U] != '.' ||
	    !isdigit((unsigned char)v[2U])) {
		return 0U;
	}

	/* we support a maximum of 2 digits in the minor version */
	extra = isdigit((unsigned char)v[3U]) ? 1U : 0U;

	/* major version first, then the minor one */
	ver = (v[0U] - '0') * 10000U;
	if (extra == 1U) {
		ver += (v[2U] - '0') * 1000U;
		ver += (v[3U] - '0') * 100U;
	} else {
		ver += (v[2U] - '0') * 100U;
	}

	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	tail = v + 3U + extra;
	if (ver >= 1200U) {
		if (memcmp(tail, "\r\n", 2U) != 0) {
			return 0U;
		}
	} else if (*tail != ' ' && *tail != '\t') {
		return 0U;
	}
	return ver;
}
- static unsigned int
- _warc_rdtyp(const char *buf, size_t bsz)
- {
- static const char _key[] = "\r\nWARC-Type:";
- const char *val, *eol;
- if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
- /* no bother */
- return WT_NONE;
- }
- val += sizeof(_key) - 1U;
- if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
- /* no end of line */
- return WT_NONE;
- }
- /* overread whitespace */
- while (val < eol && (*val == ' ' || *val == '\t'))
- ++val;
- if (val + 8U == eol) {
- if (memcmp(val, "resource", 8U) == 0)
- return WT_RSRC;
- else if (memcmp(val, "response", 8U) == 0)
- return WT_RSP;
- }
- return WT_NONE;
- }
- static warc_string_t
- _warc_rduri(const char *buf, size_t bsz)
- {
- static const char _key[] = "\r\nWARC-Target-URI:";
- const char *val, *uri, *eol, *p;
- warc_string_t res = {0U, NULL};
- if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
- /* no bother */
- return res;
- }
- /* overread whitespace */
- val += sizeof(_key) - 1U;
- if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
- /* no end of line */
- return res;
- }
- while (val < eol && (*val == ' ' || *val == '\t'))
- ++val;
- /* overread URL designators */
- if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
- /* not touching that! */
- return res;
- }
- /* spaces inside uri are not allowed, CRLF should follow */
- for (p = val; p < eol; p++) {
- if (isspace((unsigned char)*p))
- return res;
- }
- /* there must be at least space for ftp */
- if (uri < (val + 3U))
- return res;
- /* move uri to point to after :// */
- uri += 3U;
- /* now then, inspect the URI */
- if (memcmp(val, "file", 4U) == 0) {
- /* perfect, nothing left to do here */
- } else if (memcmp(val, "http", 4U) == 0 ||
- memcmp(val, "ftp", 3U) == 0) {
- /* overread domain, and the first / */
- while (uri < eol && *uri++ != '/');
- } else {
- /* not sure what to do? best to bugger off */
- return res;
- }
- res.str = uri;
- res.len = eol - uri;
- return res;
- }
/*
 * Read the Content-Length field in the BSZ header bytes at BUF.
 *
 * Returns the announced length, or -1 when the field is missing or
 * unterminated, does not start with a digit, is followed by anything
 * but the end of line, or makes strtol() set errno.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	const size_t keylen = sizeof(_key) - 1U;
	const char *cur;
	const char *eol;
	char *stop = NULL;
	long int n;

	cur = xmemmem(buf, bsz, _key, keylen);
	if (cur == NULL) {
		/* no such field */
		return -1;
	}
	cur += keylen;

	eol = _warc_find_eol(cur, buf + bsz - cur);
	if (eol == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	for (; cur < eol && (*cur == ' ' || *cur == '\t'); cur++)
		;

	/* there must be at least one digit */
	if (!isdigit((unsigned char)*cur)) {
		return -1;
	}

	errno = 0;
	n = strtol(cur, &stop, 10);
	if (errno != 0 || stop != eol) {
		/* line must end here */
		return -1;
	}
	return (size_t)n;
}
/*
 * Read the record time off the WARC-Date field in the BSZ header
 * bytes at BUF.
 *
 * Returns the stamp in seconds since the epoch, or (time_t)-1 when
 * the field is missing or unterminated, does not parse as an ISO 8601
 * Zulu stamp, or carries anything between the stamp and the end of
 * line.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const size_t keylen = sizeof(_key) - 1U;
	const char *val;
	const char *eol;
	char *stop = NULL;
	time_t stamp;

	val = xmemmem(buf, bsz, _key, keylen);
	if (val == NULL) {
		/* no such field */
		return (time_t)-1;
	}
	val += keylen;

	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return -1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(val, &stop);
	if (stop == eol) {
		return stamp;
	}
	/* line must end right after the stamp */
	return -1;
}
/*
 * Read the modification time off the Last-Modified field in the BSZ
 * header bytes at BUF.
 *
 * Returns the stamp in seconds since the epoch, or (time_t)-1 when
 * the field is missing or unterminated, does not parse as an ISO 8601
 * Zulu stamp, or carries anything between the stamp and the end of
 * line.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const size_t keylen = sizeof(_key) - 1U;
	const char *val;
	const char *eol;
	char *stop = NULL;
	time_t stamp;

	val = xmemmem(buf, bsz, _key, keylen);
	if (val == NULL) {
		/* no such field */
		return (time_t)-1;
	}
	val += keylen;

	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return -1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(val, &stop);
	if (stop == eol) {
		return stamp;
	}
	/* line must end right after the stamp */
	return -1;
}
/*
 * Find the end of the header in the BSZ bytes at BUF.
 *
 * Returns a pointer to the first byte past the terminating
 * CR LF CR LF, or NULL when there is no such sequence.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t mlen = sizeof(_marker) - 1U;
	const char *at = xmemmem(buf, bsz, _marker, mlen);

	if (at == NULL) {
		return NULL;
	}
	/* the separator still belongs to the header */
	return at + mlen;
}
/*
 * Find the end of the current line in the BSZ bytes at BUF.
 *
 * Returns a pointer to the first CR LF, or NULL when there is none.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n";

	return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
}
- /* archive_read_support_format_warc.c ends here */
|