archive_write_set_format_warc.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. /*-
  2. * Copyright (c) 2014 Sebastian Freundt
  3. * Author: Sebastian Freundt <[email protected]>
  4. *
  5. * All rights reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions
  9. * are met:
  10. * 1. Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. * 2. Redistributions in binary form must reproduce the above copyright
  13. * notice, this list of conditions and the following disclaimer in the
  14. * documentation and/or other materials provided with the distribution.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
  17. * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  18. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  19. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
  20. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  21. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  25. * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. #include "archive_platform.h"
  28. __FBSDID("$FreeBSD$");
  29. #ifdef HAVE_ERRNO_H
  30. #include <errno.h>
  31. #endif
  32. #include <stdio.h>
  33. #ifdef HAVE_STDLIB_H
  34. #include <stdlib.h>
  35. #endif
  36. #ifdef HAVE_STRING_H
  37. #include <string.h>
  38. #endif
  39. #ifdef HAVE_TIME_H
  40. #include <time.h>
  41. #endif
  42. #include "archive.h"
  43. #include "archive_entry.h"
  44. #include "archive_entry_locale.h"
  45. #include "archive_private.h"
  46. #include "archive_random_private.h"
  47. #include "archive_write_private.h"
  /*
   * Per-archive state of the WARC writer; stored in the archive_write
   * object's format_data slot by archive_write_set_format_warc().
   */
  struct warc_s {
      /* non-zero once the file-wide warcinfo record is written or was
       * disabled through the "omit-warcinfo" option */
      unsigned int omit_warcinfo:1;

      /* time stamp taken when the format was installed */
      time_t now;
      /* file type of the entry currently being written, 0 if none */
      mode_t typ;
      /* seeded from NOW when the format is installed */
      unsigned int rng;
      /* populated size */
      uint64_t populz;
  };
  56. static const char warcinfo[] =
  57. "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n"
  58. "format: WARC file version 1.0\r\n";
  /*
   * WARC record types.  Only the first three real types (warcinfo,
   * metadata, resource) are accepted by _popul_ehdr(); the numeric
   * values double as indices into its type-name table.
   */
  typedef enum {
      WT_NONE = 0,
      WT_INFO = 1,    /* warcinfo */
      WT_META = 2,    /* metadata */
      WT_RSRC = 3,    /* resource */
      WT_REQ  = 4,    /* request, unsupported */
      WT_RSP  = 5,    /* response, unsupported */
      WT_RVIS = 6,    /* revisit, unsupported */
      WT_CONV = 7,    /* conversion, unsupported */
      WT_CONT = 8,    /* continuation, unsupported at the moment */
      LAST_WT = 9     /* invalid type */
  } warc_type_t;
  80. typedef struct {
  81. warc_type_t type;
  82. const char *tgturi;
  83. const char *recid;
  84. time_t rtime;
  85. time_t mtime;
  86. const char *cnttyp;
  87. uint64_t cntlen;
  88. } warc_essential_hdr_t;
  /* Storage for one UUID, kept as four unsigned words (see _gen_uuid()). */
  typedef struct {
      unsigned int u[4];
  } warc_uuid_t;
  92. static int _warc_options(struct archive_write*, const char *key, const char *v);
  93. static int _warc_header(struct archive_write *a, struct archive_entry *entry);
  94. static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz);
  95. static int _warc_finish_entry(struct archive_write *a);
  96. static int _warc_close(struct archive_write *a);
  97. static int _warc_free(struct archive_write *a);
  98. /* private routines */
  99. static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t);
  100. static int _gen_uuid(warc_uuid_t *tgt);
  101. /*
  102. * Set output format to ISO 28500 (aka WARC) format.
  103. */
  104. int
  105. archive_write_set_format_warc(struct archive *_a)
  106. {
  107. struct archive_write *a = (struct archive_write *)_a;
  108. struct warc_s *w;
  109. archive_check_magic(_a, ARCHIVE_WRITE_MAGIC,
  110. ARCHIVE_STATE_NEW, "archive_write_set_format_warc");
  111. /* If another format was already registered, unregister it. */
  112. if (a->format_free != NULL) {
  113. (a->format_free)(a);
  114. }
  115. w = malloc(sizeof(*w));
  116. if (w == NULL) {
  117. archive_set_error(&a->archive, ENOMEM,
  118. "Can't allocate warc data");
  119. return (ARCHIVE_FATAL);
  120. }
  121. /* by default we're emitting a file wide header */
  122. w->omit_warcinfo = 0U;
  123. /* obtain current time for date fields */
  124. w->now = time(NULL);
  125. /* reset file type info */
  126. w->typ = 0;
  127. /* also initialise our rng */
  128. w->rng = (unsigned int)w->now;
  129. a->format_data = w;
  130. a->format_name = "WARC/1.0";
  131. a->format_options = _warc_options;
  132. a->format_write_header = _warc_header;
  133. a->format_write_data = _warc_data;
  134. a->format_close = _warc_close;
  135. a->format_free = _warc_free;
  136. a->format_finish_entry = _warc_finish_entry;
  137. a->archive.archive_format = ARCHIVE_FORMAT_WARC;
  138. a->archive.archive_format_name = "WARC/1.0";
  139. return (ARCHIVE_OK);
  140. }
  141. /* archive methods */
  142. static int
  143. _warc_options(struct archive_write *a, const char *key, const char *val)
  144. {
  145. struct warc_s *w = a->format_data;
  146. if (strcmp(key, "omit-warcinfo") == 0) {
  147. if (val == NULL || strcmp(val, "true") == 0) {
  148. /* great */
  149. w->omit_warcinfo = 1U;
  150. return (ARCHIVE_OK);
  151. }
  152. }
  153. /* Note: The "warn" return is just to inform the options
  154. * supervisor that we didn't handle it. It will generate
  155. * a suitable error if no one used this option. */
  156. return (ARCHIVE_WARN);
  157. }
  158. static int
  159. _warc_header(struct archive_write *a, struct archive_entry *entry)
  160. {
  161. struct warc_s *w = a->format_data;
  162. struct archive_string hdr;
  163. #define MAX_HDR_SIZE 512
  164. /* check whether warcinfo record needs outputting */
  165. if (!w->omit_warcinfo) {
  166. ssize_t r;
  167. warc_essential_hdr_t wi = {
  168. WT_INFO,
  169. /*uri*/NULL,
  170. /*urn*/NULL,
  171. /*rtm*/0,
  172. /*mtm*/0,
  173. /*cty*/"application/warc-fields",
  174. /*len*/sizeof(warcinfo) - 1U,
  175. };
  176. wi.rtime = w->now;
  177. wi.mtime = w->now;
  178. archive_string_init(&hdr);
  179. r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi);
  180. if (r >= 0) {
  181. /* jackpot! */
  182. /* now also use HDR buffer for the actual warcinfo */
  183. archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1);
  184. /* append end-of-record indicator */
  185. archive_strncat(&hdr, "\r\n\r\n", 4);
  186. /* write to output stream */
  187. __archive_write_output(a, hdr.s, archive_strlen(&hdr));
  188. }
  189. /* indicate we're done with file header writing */
  190. w->omit_warcinfo = 1U;
  191. archive_string_free(&hdr);
  192. }
  193. if (archive_entry_pathname(entry) == NULL) {
  194. archive_set_error(&a->archive, EINVAL,
  195. "Invalid filename");
  196. return (ARCHIVE_WARN);
  197. }
  198. w->typ = archive_entry_filetype(entry);
  199. w->populz = 0U;
  200. if (w->typ == AE_IFREG) {
  201. warc_essential_hdr_t rh = {
  202. WT_RSRC,
  203. /*uri*/NULL,
  204. /*urn*/NULL,
  205. /*rtm*/0,
  206. /*mtm*/0,
  207. /*cty*/NULL,
  208. /*len*/0,
  209. };
  210. ssize_t r;
  211. rh.tgturi = archive_entry_pathname(entry);
  212. rh.rtime = w->now;
  213. rh.mtime = archive_entry_mtime(entry);
  214. rh.cntlen = (size_t)archive_entry_size(entry);
  215. archive_string_init(&hdr);
  216. r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh);
  217. if (r < 0) {
  218. /* don't bother */
  219. archive_set_error(
  220. &a->archive,
  221. ARCHIVE_ERRNO_FILE_FORMAT,
  222. "cannot archive file");
  223. return (ARCHIVE_WARN);
  224. }
  225. /* otherwise append to output stream */
  226. __archive_write_output(a, hdr.s, r);
  227. /* and let subsequent calls to _data() know about the size */
  228. w->populz = rh.cntlen;
  229. archive_string_free(&hdr);
  230. return (ARCHIVE_OK);
  231. }
  232. /* just resort to erroring as per Tim's advice */
  233. archive_set_error(
  234. &a->archive,
  235. ARCHIVE_ERRNO_FILE_FORMAT,
  236. "WARC can only process regular files");
  237. return (ARCHIVE_FAILED);
  238. }
  239. static ssize_t
  240. _warc_data(struct archive_write *a, const void *buf, size_t len)
  241. {
  242. struct warc_s *w = a->format_data;
  243. if (w->typ == AE_IFREG) {
  244. int rc;
  245. /* never write more bytes than announced */
  246. if (len > w->populz) {
  247. len = (size_t)w->populz;
  248. }
  249. /* now then, out we put the whole shebang */
  250. rc = __archive_write_output(a, buf, len);
  251. if (rc != ARCHIVE_OK) {
  252. return rc;
  253. }
  254. }
  255. return len;
  256. }
  257. static int
  258. _warc_finish_entry(struct archive_write *a)
  259. {
  260. static const char _eor[] = "\r\n\r\n";
  261. struct warc_s *w = a->format_data;
  262. if (w->typ == AE_IFREG) {
  263. int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U);
  264. if (rc != ARCHIVE_OK) {
  265. return rc;
  266. }
  267. }
  268. /* reset type info */
  269. w->typ = 0;
  270. return (ARCHIVE_OK);
  271. }
  272. static int
  273. _warc_close(struct archive_write *a)
  274. {
  275. (void)a; /* UNUSED */
  276. return (ARCHIVE_OK);
  277. }
  278. static int
  279. _warc_free(struct archive_write *a)
  280. {
  281. struct warc_s *w = a->format_data;
  282. free(w);
  283. a->format_data = NULL;
  284. return (ARCHIVE_OK);
  285. }
  286. /* private routines */
  /*
   * Like strftime(3) but for time_t objects: break T down as UTC,
   * format it according to FMT and append the result to AS.
   * Nothing is appended when T cannot be converted.
   */
  static void
  xstrftime(struct archive_string *as, const char *fmt, time_t t)
  {
      struct tm *rt;
  #if defined(HAVE_GMTIME_R) || defined(HAVE__GMTIME64_S)
      struct tm timeHere;
  #endif
  #if !defined(HAVE_GMTIME_R) && defined(HAVE__GMTIME64_S)
      __time64_t tmptime;
  #endif
      char strtime[100];
      size_t len;

  #ifdef HAVE_GMTIME_R
      rt = gmtime_r(&t, &timeHere);
  #elif defined(HAVE__GMTIME64_S)
      /* _gmtime64_s() wants a __time64_t and reports failure through
       * a non-zero return value rather than a NULL pointer */
      tmptime = t;
      rt = (_gmtime64_s(&timeHere, &tmptime) == 0) ? &timeHere : NULL;
  #else
      rt = gmtime(&t);
  #endif
      if (rt == NULL)
          return;

      /* leave the hard yacker to our role model strftime() */
      len = strftime(strtime, sizeof(strtime)-1, fmt, rt);
      archive_strncat(as, strtime, len);
  }
  310. static ssize_t
  311. _popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr)
  312. {
  313. static const char _ver[] = "WARC/1.0\r\n";
  314. static const char * const _typ[LAST_WT] = {
  315. NULL, "warcinfo", "metadata", "resource", NULL
  316. };
  317. char std_uuid[48U];
  318. if (hdr.type == WT_NONE || hdr.type > WT_RSRC) {
  319. /* brilliant, how exactly did we get here? */
  320. return -1;
  321. }
  322. archive_strcpy(tgt, _ver);
  323. archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]);
  324. if (hdr.tgturi != NULL) {
  325. /* check if there's a xyz:// */
  326. static const char _uri[] = "";
  327. static const char _fil[] = "file://";
  328. const char *u;
  329. char *chk = strchr(hdr.tgturi, ':');
  330. if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') {
  331. /* yep, it's definitely a URI */
  332. u = _uri;
  333. } else {
  334. /* hm, best to prepend file:// then */
  335. u = _fil;
  336. }
  337. archive_string_sprintf(tgt,
  338. "WARC-Target-URI: %s%s\r\n", u, hdr.tgturi);
  339. }
  340. /* record time is usually when the http is sent off,
  341. * just treat the archive writing as such for a moment */
  342. xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime);
  343. /* while we're at it, record the mtime */
  344. xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime);
  345. if (hdr.recid == NULL) {
  346. /* generate one, grrrr */
  347. warc_uuid_t u;
  348. _gen_uuid(&u);
  349. /* Unfortunately, archive_string_sprintf does not
  350. * handle the minimum number following '%'.
  351. * So we have to use snprintf function here instead
  352. * of archive_string_snprintf function. */
  353. #if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900)
  354. #define snprintf _snprintf
  355. #endif
  356. snprintf(
  357. std_uuid, sizeof(std_uuid),
  358. "<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>",
  359. u.u[0U],
  360. u.u[1U] >> 16U, u.u[1U] & 0xffffU,
  361. u.u[2U] >> 16U, u.u[2U] & 0xffffU,
  362. u.u[3U]);
  363. hdr.recid = std_uuid;
  364. }
  365. /* record-id is mandatory, fingers crossed we won't fail */
  366. archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid);
  367. if (hdr.cnttyp != NULL) {
  368. archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp);
  369. }
  370. /* next one is mandatory */
  371. archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen);
  372. /**/
  373. archive_strncat(tgt, "\r\n", 2);
  374. return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt);
  375. }
  376. static int
  377. _gen_uuid(warc_uuid_t *tgt)
  378. {
  379. archive_random(tgt->u, sizeof(tgt->u));
  380. /* obey uuid version 4 rules */
  381. tgt->u[1U] &= 0xffff0fffU;
  382. tgt->u[1U] |= 0x4000U;
  383. tgt->u[2U] &= 0x3fffffffU;
  384. tgt->u[2U] |= 0x80000000U;
  385. return 0;
  386. }
  387. /* archive_write_set_format_warc.c ends here */