|
|
/*-
|
|
|
* Copyright (c) 2014 Sebastian Freundt
|
|
|
* Author: Sebastian Freundt <devel@fresse.org>
|
|
|
*
|
|
|
* All rights reserved.
|
|
|
*
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
* are met:
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
*
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
|
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
|
* IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
*/
|
|
|
|
|
|
#include "archive_platform.h"
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
|
#ifdef HAVE_ERRNO_H
|
|
|
#include <errno.h>
|
|
|
#endif
|
|
|
#include <stdio.h>
|
|
|
#ifdef HAVE_STDLIB_H
|
|
|
#include <stdlib.h>
|
|
|
#endif
|
|
|
#ifdef HAVE_STRING_H
|
|
|
#include <string.h>
|
|
|
#endif
|
|
|
#ifdef HAVE_TIME_H
|
|
|
#include <time.h>
|
|
|
#endif
|
|
|
|
|
|
#include "archive.h"
|
|
|
#include "archive_entry.h"
|
|
|
#include "archive_entry_locale.h"
|
|
|
#include "archive_private.h"
|
|
|
#include "archive_random_private.h"
|
|
|
#include "archive_write_private.h"
|
|
|
|
|
|
struct warc_s {
|
|
|
unsigned int omit_warcinfo:1;
|
|
|
|
|
|
time_t now;
|
|
|
mode_t typ;
|
|
|
unsigned int rng;
|
|
|
/* populated size */
|
|
|
uint64_t populz;
|
|
|
};
|
|
|
|
|
|
static const char warcinfo[] =
|
|
|
"software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n"
|
|
|
"format: WARC file version 1.0\r\n";
|
|
|
|
|
|
typedef enum {
|
|
|
WT_NONE,
|
|
|
/* warcinfo */
|
|
|
WT_INFO,
|
|
|
/* metadata */
|
|
|
WT_META,
|
|
|
/* resource */
|
|
|
WT_RSRC,
|
|
|
/* request, unsupported */
|
|
|
WT_REQ,
|
|
|
/* response, unsupported */
|
|
|
WT_RSP,
|
|
|
/* revisit, unsupported */
|
|
|
WT_RVIS,
|
|
|
/* conversion, unsupported */
|
|
|
WT_CONV,
|
|
|
/* continuation, unsupported at the moment */
|
|
|
WT_CONT,
|
|
|
/* invalid type */
|
|
|
LAST_WT
|
|
|
} warc_type_t;
|
|
|
|
|
|
typedef struct {
|
|
|
warc_type_t type;
|
|
|
const char *tgturi;
|
|
|
const char *recid;
|
|
|
time_t rtime;
|
|
|
time_t mtime;
|
|
|
const char *cnttyp;
|
|
|
uint64_t cntlen;
|
|
|
} warc_essential_hdr_t;
|
|
|
|
|
|
typedef struct {
|
|
|
unsigned int u[4U];
|
|
|
} warc_uuid_t;
|
|
|
|
|
|
static int _warc_options(struct archive_write*, const char *key, const char *v);
|
|
|
static int _warc_header(struct archive_write *a, struct archive_entry *entry);
|
|
|
static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz);
|
|
|
static int _warc_finish_entry(struct archive_write *a);
|
|
|
static int _warc_close(struct archive_write *a);
|
|
|
static int _warc_free(struct archive_write *a);
|
|
|
|
|
|
/* private routines */
|
|
|
static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t);
|
|
|
static int _gen_uuid(warc_uuid_t *tgt);
|
|
|
|
|
|
|
|
|
/*
|
|
|
* Set output format to ISO 28500 (aka WARC) format.
|
|
|
*/
|
|
|
int
|
|
|
archive_write_set_format_warc(struct archive *_a)
|
|
|
{
|
|
|
struct archive_write *a = (struct archive_write *)_a;
|
|
|
struct warc_s *w;
|
|
|
|
|
|
archive_check_magic(_a, ARCHIVE_WRITE_MAGIC,
|
|
|
ARCHIVE_STATE_NEW, "archive_write_set_format_warc");
|
|
|
|
|
|
/* If another format was already registered, unregister it. */
|
|
|
if (a->format_free != NULL) {
|
|
|
(a->format_free)(a);
|
|
|
}
|
|
|
|
|
|
w = malloc(sizeof(*w));
|
|
|
if (w == NULL) {
|
|
|
archive_set_error(&a->archive, ENOMEM,
|
|
|
"Can't allocate warc data");
|
|
|
return (ARCHIVE_FATAL);
|
|
|
}
|
|
|
/* by default we're emitting a file wide header */
|
|
|
w->omit_warcinfo = 0U;
|
|
|
/* obtain current time for date fields */
|
|
|
w->now = time(NULL);
|
|
|
/* reset file type info */
|
|
|
w->typ = 0;
|
|
|
/* also initialise our rng */
|
|
|
w->rng = (unsigned int)w->now;
|
|
|
|
|
|
a->format_data = w;
|
|
|
a->format_name = "WARC/1.0";
|
|
|
a->format_options = _warc_options;
|
|
|
a->format_write_header = _warc_header;
|
|
|
a->format_write_data = _warc_data;
|
|
|
a->format_close = _warc_close;
|
|
|
a->format_free = _warc_free;
|
|
|
a->format_finish_entry = _warc_finish_entry;
|
|
|
a->archive.archive_format = ARCHIVE_FORMAT_WARC;
|
|
|
a->archive.archive_format_name = "WARC/1.0";
|
|
|
return (ARCHIVE_OK);
|
|
|
}
|
|
|
|
|
|
|
|
|
/* archive methods */
|
|
|
static int
|
|
|
_warc_options(struct archive_write *a, const char *key, const char *val)
|
|
|
{
|
|
|
struct warc_s *w = a->format_data;
|
|
|
|
|
|
if (strcmp(key, "omit-warcinfo") == 0) {
|
|
|
if (val == NULL || strcmp(val, "true") == 0) {
|
|
|
/* great */
|
|
|
w->omit_warcinfo = 1U;
|
|
|
return (ARCHIVE_OK);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
/* Note: The "warn" return is just to inform the options
|
|
|
* supervisor that we didn't handle it. It will generate
|
|
|
* a suitable error if no one used this option. */
|
|
|
return (ARCHIVE_WARN);
|
|
|
}
|
|
|
|
|
|
static int
|
|
|
_warc_header(struct archive_write *a, struct archive_entry *entry)
|
|
|
{
|
|
|
struct warc_s *w = a->format_data;
|
|
|
struct archive_string hdr;
|
|
|
#define MAX_HDR_SIZE 512
|
|
|
|
|
|
/* check whether warcinfo record needs outputting */
|
|
|
if (!w->omit_warcinfo) {
|
|
|
ssize_t r;
|
|
|
warc_essential_hdr_t wi = {
|
|
|
WT_INFO,
|
|
|
/*uri*/NULL,
|
|
|
/*urn*/NULL,
|
|
|
/*rtm*/0,
|
|
|
/*mtm*/0,
|
|
|
/*cty*/"application/warc-fields",
|
|
|
/*len*/sizeof(warcinfo) - 1U,
|
|
|
};
|
|
|
wi.rtime = w->now;
|
|
|
wi.mtime = w->now;
|
|
|
|
|
|
archive_string_init(&hdr);
|
|
|
r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi);
|
|
|
if (r >= 0) {
|
|
|
/* jackpot! */
|
|
|
/* now also use HDR buffer for the actual warcinfo */
|
|
|
archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1);
|
|
|
|
|
|
/* append end-of-record indicator */
|
|
|
archive_strncat(&hdr, "\r\n\r\n", 4);
|
|
|
|
|
|
/* write to output stream */
|
|
|
__archive_write_output(a, hdr.s, archive_strlen(&hdr));
|
|
|
}
|
|
|
/* indicate we're done with file header writing */
|
|
|
w->omit_warcinfo = 1U;
|
|
|
archive_string_free(&hdr);
|
|
|
}
|
|
|
|
|
|
if (archive_entry_pathname(entry) == NULL) {
|
|
|
archive_set_error(&a->archive, EINVAL,
|
|
|
"Invalid filename");
|
|
|
return (ARCHIVE_WARN);
|
|
|
}
|
|
|
|
|
|
w->typ = archive_entry_filetype(entry);
|
|
|
w->populz = 0U;
|
|
|
if (w->typ == AE_IFREG) {
|
|
|
warc_essential_hdr_t rh = {
|
|
|
WT_RSRC,
|
|
|
/*uri*/NULL,
|
|
|
/*urn*/NULL,
|
|
|
/*rtm*/0,
|
|
|
/*mtm*/0,
|
|
|
/*cty*/NULL,
|
|
|
/*len*/0,
|
|
|
};
|
|
|
ssize_t r;
|
|
|
rh.tgturi = archive_entry_pathname(entry);
|
|
|
rh.rtime = w->now;
|
|
|
rh.mtime = archive_entry_mtime(entry);
|
|
|
rh.cntlen = (size_t)archive_entry_size(entry);
|
|
|
|
|
|
archive_string_init(&hdr);
|
|
|
r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh);
|
|
|
if (r < 0) {
|
|
|
/* don't bother */
|
|
|
archive_set_error(
|
|
|
&a->archive,
|
|
|
ARCHIVE_ERRNO_FILE_FORMAT,
|
|
|
"cannot archive file");
|
|
|
return (ARCHIVE_WARN);
|
|
|
}
|
|
|
/* otherwise append to output stream */
|
|
|
__archive_write_output(a, hdr.s, r);
|
|
|
/* and let subsequent calls to _data() know about the size */
|
|
|
w->populz = rh.cntlen;
|
|
|
archive_string_free(&hdr);
|
|
|
return (ARCHIVE_OK);
|
|
|
}
|
|
|
/* just resort to erroring as per Tim's advice */
|
|
|
archive_set_error(
|
|
|
&a->archive,
|
|
|
ARCHIVE_ERRNO_FILE_FORMAT,
|
|
|
"WARC can only process regular files");
|
|
|
return (ARCHIVE_FAILED);
|
|
|
}
|
|
|
|
|
|
static ssize_t
|
|
|
_warc_data(struct archive_write *a, const void *buf, size_t len)
|
|
|
{
|
|
|
struct warc_s *w = a->format_data;
|
|
|
|
|
|
if (w->typ == AE_IFREG) {
|
|
|
int rc;
|
|
|
|
|
|
/* never write more bytes than announced */
|
|
|
if (len > w->populz) {
|
|
|
len = (size_t)w->populz;
|
|
|
}
|
|
|
|
|
|
/* now then, out we put the whole shebang */
|
|
|
rc = __archive_write_output(a, buf, len);
|
|
|
if (rc != ARCHIVE_OK) {
|
|
|
return rc;
|
|
|
}
|
|
|
}
|
|
|
return len;
|
|
|
}
|
|
|
|
|
|
static int
|
|
|
_warc_finish_entry(struct archive_write *a)
|
|
|
{
|
|
|
static const char _eor[] = "\r\n\r\n";
|
|
|
struct warc_s *w = a->format_data;
|
|
|
|
|
|
if (w->typ == AE_IFREG) {
|
|
|
int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U);
|
|
|
|
|
|
if (rc != ARCHIVE_OK) {
|
|
|
return rc;
|
|
|
}
|
|
|
}
|
|
|
/* reset type info */
|
|
|
w->typ = 0;
|
|
|
return (ARCHIVE_OK);
|
|
|
}
|
|
|
|
|
|
static int
|
|
|
_warc_close(struct archive_write *a)
|
|
|
{
|
|
|
(void)a; /* UNUSED */
|
|
|
return (ARCHIVE_OK);
|
|
|
}
|
|
|
|
|
|
static int
|
|
|
_warc_free(struct archive_write *a)
|
|
|
{
|
|
|
struct warc_s *w = a->format_data;
|
|
|
|
|
|
free(w);
|
|
|
a->format_data = NULL;
|
|
|
return (ARCHIVE_OK);
|
|
|
}
|
|
|
|
|
|
|
|
|
/* private routines */
|
|
|
static void
|
|
|
xstrftime(struct archive_string *as, const char *fmt, time_t t)
|
|
|
{
|
|
|
/** like strftime(3) but for time_t objects */
|
|
|
struct tm *rt;
|
|
|
#if defined(HAVE_GMTIME_R) || defined(HAVE__GMTIME64_S)
|
|
|
struct tm timeHere;
|
|
|
#endif
|
|
|
char strtime[100];
|
|
|
size_t len;
|
|
|
|
|
|
#ifdef HAVE_GMTIME_R
|
|
|
if ((rt = gmtime_r(&t, &timeHere)) == NULL)
|
|
|
return;
|
|
|
#elif defined(HAVE__GMTIME64_S)
|
|
|
_gmtime64_s(&timeHere, &t);
|
|
|
#else
|
|
|
if ((rt = gmtime(&t)) == NULL)
|
|
|
return;
|
|
|
#endif
|
|
|
/* leave the hard yacker to our role model strftime() */
|
|
|
len = strftime(strtime, sizeof(strtime)-1, fmt, rt);
|
|
|
archive_strncat(as, strtime, len);
|
|
|
}
|
|
|
|
|
|
static ssize_t
|
|
|
_popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr)
|
|
|
{
|
|
|
static const char _ver[] = "WARC/1.0\r\n";
|
|
|
static const char * const _typ[LAST_WT] = {
|
|
|
NULL, "warcinfo", "metadata", "resource", NULL
|
|
|
};
|
|
|
char std_uuid[48U];
|
|
|
|
|
|
if (hdr.type == WT_NONE || hdr.type > WT_RSRC) {
|
|
|
/* brilliant, how exactly did we get here? */
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
archive_strcpy(tgt, _ver);
|
|
|
|
|
|
archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]);
|
|
|
|
|
|
if (hdr.tgturi != NULL) {
|
|
|
/* check if there's a xyz:// */
|
|
|
static const char _uri[] = "";
|
|
|
static const char _fil[] = "file://";
|
|
|
const char *u;
|
|
|
char *chk = strchr(hdr.tgturi, ':');
|
|
|
|
|
|
if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') {
|
|
|
/* yep, it's definitely a URI */
|
|
|
u = _uri;
|
|
|
} else {
|
|
|
/* hm, best to prepend file:// then */
|
|
|
u = _fil;
|
|
|
}
|
|
|
archive_string_sprintf(tgt,
|
|
|
"WARC-Target-URI: %s%s\r\n", u, hdr.tgturi);
|
|
|
}
|
|
|
|
|
|
/* record time is usually when the http is sent off,
|
|
|
* just treat the archive writing as such for a moment */
|
|
|
xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime);
|
|
|
|
|
|
/* while we're at it, record the mtime */
|
|
|
xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime);
|
|
|
|
|
|
if (hdr.recid == NULL) {
|
|
|
/* generate one, grrrr */
|
|
|
warc_uuid_t u;
|
|
|
|
|
|
_gen_uuid(&u);
|
|
|
/* Unfortunately, archive_string_sprintf does not
|
|
|
* handle the minimum number following '%'.
|
|
|
* So we have to use snprintf function here instead
|
|
|
* of archive_string_snprintf function. */
|
|
|
#if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900)
|
|
|
#define snprintf _snprintf
|
|
|
#endif
|
|
|
snprintf(
|
|
|
std_uuid, sizeof(std_uuid),
|
|
|
"<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>",
|
|
|
u.u[0U],
|
|
|
u.u[1U] >> 16U, u.u[1U] & 0xffffU,
|
|
|
u.u[2U] >> 16U, u.u[2U] & 0xffffU,
|
|
|
u.u[3U]);
|
|
|
hdr.recid = std_uuid;
|
|
|
}
|
|
|
|
|
|
/* record-id is mandatory, fingers crossed we won't fail */
|
|
|
archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid);
|
|
|
|
|
|
if (hdr.cnttyp != NULL) {
|
|
|
archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp);
|
|
|
}
|
|
|
|
|
|
/* next one is mandatory */
|
|
|
archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen);
|
|
|
/**/
|
|
|
archive_strncat(tgt, "\r\n", 2);
|
|
|
|
|
|
return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt);
|
|
|
}
|
|
|
|
|
|
static int
|
|
|
_gen_uuid(warc_uuid_t *tgt)
|
|
|
{
|
|
|
archive_random(tgt->u, sizeof(tgt->u));
|
|
|
/* obey uuid version 4 rules */
|
|
|
tgt->u[1U] &= 0xffff0fffU;
|
|
|
tgt->u[1U] |= 0x4000U;
|
|
|
tgt->u[2U] &= 0x3fffffffU;
|
|
|
tgt->u[2U] |= 0x80000000U;
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
/* archive_write_set_format_warc.c ends here */
|