lib/warcRecord/builder.js
- 'use strict'
- const WARCRecord = require('./record')
- const { crlf, begin } = require('./fieldIdentifiers')
-
- /**
- * @type {{header: symbol, content1: symbol, content2: symbol, consumeCRLFHeader: symbol, consumeCRLFContent1: symbol, consumeCRLFContent2: symbol}}
- */
- const parsingStates = {
- header: Symbol('warc-parsing-header'),
- content1: Symbol('warc-parsing-content1'),
- content2: Symbol('warc-parsing-content2'),
- consumeCRLFHeader: Symbol('warc-parsing-comsume-crlf-header'),
- consumeCRLFContent1: Symbol('warc-parsing-comsume-crlf-c1'),
- consumeCRLFContent2: Symbol('warc-parsing-comsume-crlf-c2')
- }
-
- /**
- * @type {number}
- */
- const WFIBeginLen = begin.length
-
- /**
- * @param {Buffer} line
- * @returns {boolean}
- */
- function isJustCRLF (line) {
- if (line.length !== 2) return false
- return line[0] === crlf[0] && line[1] === crlf[1]
- }
-
- /**
- * @param {Buffer} line
- * @returns {boolean}
- */
- function isWARCRevisionLine (line) {
- if (line.length > 11) return false
- let i = 0
- while (i < WFIBeginLen) {
- if (begin[i] !== line[i]) return false
- i += 1
- }
- return true
- }
-
- /**
- * @desc Progressively builds warc records by consuming the file line by line
- */
- class RecordBuilder {
- /**
- * @desc Create a new RecordBuilder
- */
- constructor () {
- /**
- * @type {{header: Buffer[], c1: Buffer[], c2: Buffer[]}}
- * @private
- */
- this._parts = {
- header: [],
- c1: [],
- c2: []
- }
-
- /**
- * @type {symbol}
- * @private
- */
- this._parsingState = parsingStates.header
- }
-
- /**
- * @desc Returns a new WARC record if one can be created otherwise returns null
- * @returns {?WARCRecord}
- */
- buildRecord () {
- if (this._parts.header.length === 0) return null
- const newRecord = new WARCRecord(this._parts)
- this._parts.header = []
- this._parts.c1 = []
- this._parts.c2 = []
- return newRecord
- }
-
- /**
- * @desc Consumes a line of a WARC file.
- * If a record can be built this function returns a new WARCRecord otherwise null
- * @param {Buffer} line - The line to be consumed
- * @returns {?WARCRecord}
- */
- consumeLine (line) {
- let newRecord = null
- if (isWARCRevisionLine(line)) {
- this._parsingState = parsingStates.header
- newRecord = this.buildRecord()
- }
- const isSep = isJustCRLF(line)
- switch (this._parsingState) {
- case parsingStates.header:
- if (!isSep) {
- this._parts.header.push(line)
- } else {
- this._parsingState = parsingStates.consumeCRLFHeader
- }
- break
- case parsingStates.consumeCRLFHeader:
- if (!isSep) {
- this._parts.c1.push(line)
- this._parsingState = parsingStates.content1
- }
- break
- case parsingStates.content1:
- if (!isSep) {
- this._parts.c1.push(line)
- } else {
- this._parsingState = parsingStates.consumeCRLFContent1
- }
- break
- case parsingStates.consumeCRLFContent1:
- if (!isSep) {
- this._parts.c2.push(line)
- this._parsingState = parsingStates.content2
- }
- break
- case parsingStates.content2:
- if (!isSep) {
- this._parts.c2.push(line)
- } else {
- this._parsingState = parsingStates.consumeCRLFContent2
- }
- break
- case parsingStates.consumeCRLFContent2:
- break
- }
- return newRecord
- }
- }
-
- /**
- * @type {RecordBuilder}
- */
- module.exports = RecordBuilder