lib/writers/warcFields.js
/**
* @type {string}
*/
const CRLF = '\r\n'
/**
* @type {string}
*/
const CRLF2x = '\r\n\r\n'
/**
* @type {string}
*/
const recordSeparator = '\r\n\r\n'
/**
* @type {string}
*/
const WARCV = '1.0'
/**
* @type {string}
*/
const WARCSlashV = `WARC/${WARCV}${CRLF}`
/**
* @type {{warcinfo: string, metadata: string, request: string, response: string, revisit: string, resource: string, conversion: string, unknown: string, continuation: string}}
*/
const WARCTypes = {
warcinfo: 'warcinfo',
metadata: 'metadata',
request: 'request',
response: 'response',
revisit: 'revisit',
resource: 'resource',
conversion: 'conversion',
unknown: 'unknown',
continuation: 'continuation'
}
const WARCContentTypes = {
warcFields: 'Content-Type: application/warc-fields\r\n',
httpRequest: 'Content-Type: application/http; msgtype=request\r\n',
httpResponse: 'Content-Type: application/http; msgtype=response\r\n'
}
/**
* @param {string} uuid
* @return {string}
*/
function recordId (uuid) {
return `WARC-Record-ID: <urn:uuid:${uuid}>${CRLF}`
}
/**
* @param {string} date
* @return {string}
*/
function warcDate (date) {
return `WARC-Date: ${date}${CRLF}`
}
/**
* @param {string} targetURI
* @return {string}
*/
function warcTargetURI (targetURI) {
return `WARC-Target-URI: ${targetURI}${CRLF}`
}
/**
* @param {string} type
* @return {string}
*/
function warcType (type) {
return `WARC-Type: ${type}${CRLF}`
}
/**
* @param {string} fileName
* @return {string}
*/
function warcFilename (fileName) {
return `WARC-Filename: ${fileName}${CRLF}`
}
/**
* @param {string|number} contentLen
* @return {string}
*/
function warcContentLength (contentLen) {
return `Content-Length: ${contentLen}${CRLF}`
}
/**
* @param {string} contentType
* @return {string}
*/
function warcContentType (contentType) {
return `Content-Type: ${contentType}${CRLF}`
}
/**
* @param {string} concurrentTo
* @return {string}
*/
function warcConcurrentTo (concurrentTo) {
return `WARC-Concurrent-To: <urn:uuid:${concurrentTo}>${CRLF}`
}
/**
* @param {string} wid
* @return {string}
*/
function warcWarcInfoId (wid) {
return `WARC-Warcinfo-ID: <urn:uuid:${wid}>${CRLF}`
}
/**
*
* @param {string} type
* @param {Object} reqFields
* @param {string} reqFields.rid
* @param {string} reqFields.date
* @param {number} reqFields.len
* @param {?string} [reqFields.targetURI]
* @return {string}
*/
function requiredHeaderFields (type, { rid, date, len, targetURI }) {
if (targetURI != null) {
return `${WARCSlashV}${warcType(type)}${recordId(rid)}${warcDate(
date
)}${warcTargetURI(targetURI)}${warcContentLength(len)}`
}
return `${WARCSlashV}${warcType(type)}${recordId(rid)}${warcDate(
date
)}${warcContentLength(len)}`
}
/**
* @typedef {Object} WARCInfoHeader
* @property {string} date - The date value for WARC-Date
* @property {?string} [fileName] - The name of the warc file
* @property {?string} [targetURI] - The target URI for the record
* @property {string} rid - The id of the record
* @property {number} len - The length of the records content
*/
/**
* @param {WARCInfoHeader} infoHeader
* @return {string}
*/
function warcInfoHeader (infoHeader) {
const required = `${requiredHeaderFields(WARCTypes.warcinfo, infoHeader)}${
WARCContentTypes.warcFields
}`
if (infoHeader.fileName != null) {
return `${required}${warcFilename(infoHeader.fileName)}`
}
return required
}
/**
* @param {Object} infoContent
* @return {string}
*/
function warcInfoContent (infoContent) {
const base = [`format: WARC File Format ${WARCV}${CRLF}`]
for (let key in infoContent) {
base.push(`${key}: ${infoContent[key]}${CRLF}`)
}
return base.join('')
}
/**
* @typedef {Object} WARCMetadataHeader
* @property {string} targetURI - The URI the records are for
* @property {string} now - The date value for WARC-Date
* @property {string} concurrentTo - The record id this metadata record is associated with
* @property {string} rid - The record id of this record
* @property {number} len - The length of this records content
* @property {?string} wid - The record id of the Warcinfo record
*/
/**
* @param {WARCMetadataHeader} metadataHeader
* @returns {string}
*/
function warcMetadataHeader ({ targetURI, now, concurrentTo, rid, len, wid }) {
const base = [
requiredHeaderFields(WARCTypes.metadata, {
date: now,
len,
rid,
targetURI
}),
WARCContentTypes.warcFields
]
if (concurrentTo != null) {
base.push(warcConcurrentTo(concurrentTo))
}
if (wid != null) {
base.push(warcWarcInfoId(wid))
}
return base.join('')
}
/**
* @typedef {Object} WARCRequestHeader
* @property {string} targetURI - The URI the record is for
* @property {string} now - The date value for WARC-Date
* @property {string} concurrentTo - The record id of the record this record associated with
* @property {string} rid - The record id of this record
* @property {number} len - The length of this records content
* @property {?string} wid - The record id of the Warcinfo record
*/
/**
* @param {WARCRequestHeader} requestHeader
* @returns {string}
*/
function warcRequestHeader ({ targetURI, now, concurrentTo, rid, len, wid }) {
const base = [
requiredHeaderFields(WARCTypes.request, { date: now, len, rid, targetURI }),
WARCContentTypes.httpRequest
]
if (concurrentTo != null) {
base.push(warcConcurrentTo(concurrentTo))
}
if (wid != null) {
base.push(warcWarcInfoId(wid))
}
return base.join('')
}
/**
* @typedef {Object} WARCResponseHeader
* @property {string} targetURI - The URI the record is for
* @property {string} now - The date value for WARC-Date
* @property {string} rid - The record id of this record
* @property {number} len - The length of this records content
* @property {?string} wid - The record id of the Warcinfo record
*/
/**
* @param {WARCResponseHeader} responseHeader
* @returns {string}
*/
function warcResponseHeader ({ targetURI, now, rid, len, wid }) {
const base = [
requiredHeaderFields(WARCTypes.response, {
date: now,
len,
rid,
targetURI
}),
WARCContentTypes.httpResponse
]
if (wid != null) {
base.push(warcWarcInfoId(wid))
}
return base.join('')
}
module.exports = {
requiredHeaderFields,
/**
* @type {function(headerInfo: WARCInfoHeader): string}
*/
warcInfoHeader,
/**
* @type {function(infoContent: Object): string}
*/
warcInfoContent,
/**
* @type {function(reqiestData: WARCRequestHeader): string}
*/
warcRequestHeader,
warcResponseHeader,
warcMetadataHeader,
recordSeparator,
CRLF,
CRLF2x,
WARCTypes,
WARCV,
WARCContentTypes
}