Home Reference Source

lib/writers/puppeteer.js

const { URL } = require('url')
const { STATUS_CODES } = require('http')
const { CRLF } = require('./warcFields')
const WARCWriterBase = require('./warcWriterBase')
const {
  stringifyHeaders,
  stringifyRequestHeaders
} = require('../utils/headerSerializers')
const { httpRequestPath } = require('../utils/url-helpers')
const { noGZ, replaceContentLen } = require('./constants')

/**
 * @extends {WARCWriterBase}
 * @desc WARC Generator for use with puppeteer
 * @see https://github.com/GoogleChrome/puppeteer
 */
class PuppeteerWARCGenerator extends WARCWriterBase {
  /**
   * @desc Create a new PuppeteerWARCGenerator
   * @param {?WARCFileOpts} [defaultOpts]
   */
  constructor (defaultOpts) {
    super(defaultOpts)
    /**
     * @type {URL}
     * @private
     */
    this._UP = new URL('about:blank')
  }

  /**
   * @param {PuppeteerRequestCapturer} capturer - The Puppeteer request capturer that contains requests
   * to be serialized to the WARC
   * @param {WARCGenOpts} genOpts - Options for generating the WARC and optionally generating
   * WARC info, WARC info + Webrecorder Player bookmark list, metadata records
   * @return {Promise<void>} - A Promise that resolves when WARC generation is complete
   */
  async generateWARC (capturer, genOpts) {
    const { warcOpts, winfo, metadata } = genOpts
    this.initWARC(warcOpts.warcPath, warcOpts)
    if (winfo != null) {
      await this.writeWarcInfoRecord(winfo)
    }
    if (genOpts.pages) {
      await this.writeWebrecorderBookmarksInfoRecord(genOpts.pages)
    }
    if (metadata != null) {
      await this.writeWarcMetadata(metadata.targetURI, metadata.content)
    }
    for (let request of capturer.iterateRequests()) {
      try {
        await this.generateWarcEntry(request)
      } catch (error) {
        /* istanbul ignore next */
        console.error(error)
      }
    }
    return new Promise(resolve => {
      this.once('finished', resolve)
      this.end()
    })
  }

  /**
   * @desc Generate a WARC record
   * @param {Request} request - A Puppeteer Request object
   * @return {Promise<void>}
   */
  async generateWarcEntry (request) {
    if (request.url().indexOf('data:') === 0) return
    const response = request.response()
    this._UP.href = request.url()
    let reqHTTP = `${request.method()} ${httpRequestPath(
      this._UP
    )} HTTP/1.1${CRLF}${stringifyRequestHeaders(
      request.headers(),
      this._UP.host
    )}`
    if (response) {
      let resHTTP = `HTTP/1.1 ${response.status()} ${
        STATUS_CODES[response.status()]
      } ${CRLF}${stringifyHeaders(response.headers())}`
      let body
      let wasError = false
      try {
        body = await response.buffer()
      } catch (e) {
        wasError = true
      }
      if (!wasError) {
        resHTTP = resHTTP.replace(noGZ, '')
        resHTTP = resHTTP.replace(
          replaceContentLen,
          `Content-Length: ${Buffer.byteLength(body, 'utf8')}${CRLF}`
        )
      } else {
        // indicate that this record has 0 content
        resHTTP = resHTTP.replace(
          replaceContentLen,
          `Content-Length: 0${CRLF}`
        )
      }
      return this.writeRequestResponseRecords(
        this._UP.href,
        { headers: reqHTTP, data: request.postData() },
        { headers: resHTTP, data: body }
      )
    }
    return this.writeRequestRecord(this._UP.href, reqHTTP, request.postData())
  }
}

/**
 * @type {PuppeteerWARCGenerator}
 */
module.exports = PuppeteerWARCGenerator