Reference Source

lib/crawler/chrome.js

/*
 Squidwarc  Copyright (C) 2017-Present John Berlin <n0tan3rd@gmail.com>

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 Squidwarc is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this Squidwarc.  If not, see <http://www.gnu.org/licenses/>
 */
const EventEmitter = require('eventemitter3')
const RequestMonitor = require('../../node-warc/lib/requestCapturers/remoteChrome')
const RemoteChromeWARCGenerator = require('../../node-warc/lib/writers/remoteChrome')
const defaults = require('../defaults')
const Launcher = require('../launcher/chrome')
const NavMan = require('./navigationMan')
const InjectManager = require('../injectManager')

/**
 * @desc Crawler based on cyrus-and/chrome-remote-interface
 * @extends {EventEmitter}
 */
class ChromeCrawler extends EventEmitter {
  /**
   * @desc Create a new ChromeCrawler instance. For a description of the expected options see the
   * JSDoc CrawlConfig typedef {@link CrawlConfig}
   * @param {CrawlConfig} options - The crawl config for this crawl
   */
  constructor (options) {
    super()
    /**
     * @desc Crawl configuration options
     * @type {CrawlConfig}
     */
    this.options = options

    /**
     * @desc Devtools protocol client for issuing commands to the browser, null until {@link init} has run
     * @type {?CRI}
     * @private
     */
    this._client = null

    /**
     * @desc Flag indicating whether the crawler should close the browser connection once the process exits
     * @type {boolean}
     * @private
     */
    this._autoClose = false

    /**
     * @desc Handles the tracking and capturing of the HTTP requests made by the browser,
     * null until {@link init} has completed successfully
     * @type {?RequestMonitor}
     */
    this.requestMonitor = null

    /**
     * @desc The current url the crawler is visiting
     * @type {?string}
     */
    this._currentUrl = null

    /**
     * @desc WARC generator for use with cyrus-and/chrome-remote-interface
     * @type {RemoteChromeWARCGenerator}
     * @private
     */
    this._warcGenerator = new RemoteChromeWARCGenerator()

    /**
     * @desc Manager for detecting network-idle, if we have not navigated or if we have reached the global wait time
     * @type {NavigationMan}
     * @private
     */
    this._navMan = new NavMan(options.crawlControl, this)

    /**
     * @desc The UserAgent string of the remote instance we are connecting to
     * @type {string}
     * @private
     */
    this._ua = ''

    // These methods are handed out as callbacks / event listeners, so pin their receiver
    this.init = this.init.bind(this)
    this._onWARCGenFinished = this._onWARCGenFinished.bind(this)
    this._onWARCGenError = this._onWARCGenError.bind(this)
    this._close = this._close.bind(this)
    this._didNavigate = this._didNavigate.bind(this)
  }

  /**
   * @emits {connected} when the required setup is done
   * @emits {error} with type 'enabling-browser-hooks' when enabling the Runtime, Page or Network domain fails
   * @desc Connect to the Chrome instance the crawler will be using and setup crawler
   * @return {Promise<void>}
   */
  async init () {
    if (this.options.chrome.launch) {
      this._client = await Launcher.launch(this.options.chrome)
    } else {
      this._client = await Launcher.connect(this.options.chrome)
    }
    this._warcGenerator.on('finished', this._onWARCGenFinished)
    this._warcGenerator.on('error', this._onWARCGenError)
    try {
      await this._client.Runtime.enable()
      await this._client.Page.enable()
      await this._client.Network.enable()
    } catch (err) {
      // Without the protocol domains enabled the remaining setup is skipped: report and bail out
      this.emit('error', { type: 'enabling-browser-hooks', err })
      return
    }
    await this._client.Animation.setPlaybackRate({ playbackRate: 1000 })
    this._ua = await this.getUserAgent()
    await this._initInjects()
    this.requestMonitor = new RequestMonitor(this._client.Network, this._navMan)
    this.emit('connected')
  }

  /**
   * @desc Instruct the browser to inject JavaScript into every page. Uses
   * Page.addScriptToEvaluateOnNewDocument when the connected client exposes it and
   * falls back to Page.addScriptToEvaluateOnLoad otherwise
   * @return {Promise<void>}
   * @private
   */
  async _initInjects () {
    if (this._client.Page.addScriptToEvaluateOnNewDocument) {
      await this._client.Page.addScriptToEvaluateOnNewDocument(
        InjectManager.getCrawlInjects()
      )
    } else {
      await this._client.Page.addScriptToEvaluateOnLoad(
        InjectManager.getCrawlInjects(true)
      )
    }
  }

  /**
   * @desc Navigate to a new Web Page: records the url, starts request capturing, issues
   * Page.navigate and notifies the navigation manager that a navigation was started
   * @param {string} url - The url to navigate the browser to
   */
  navigate (url) {
    this._currentUrl = url
    this.requestMonitor.startCapturing()
    // Hand over the bound forwarding callback rather than a bare method reference,
    // so the navigation manager is always invoked with itself as the receiver
    this._client.Page.navigate({ url }, this._didNavigate)
    this._navMan.startedNav(url)
  }

  /**
   * @desc Equivalent to hitting the refresh button when it is an X
   * @return {Promise<any>}
   */
  stopPageLoading () {
    return this._client.Page.stopLoading()
  }

  /**
   * @desc Stop capturing the current web pages network requests
   */
  stopCapturingNetwork () {
    this.requestMonitor.stopCapturing()
  }

  /**
   * @desc Stop the page loading and stop capturing requests
   * @return {Promise<void>}
   */
  stop () {
    this.requestMonitor.stopCapturing()
    return this._client.Page.stopLoading()
  }

  /**
   * @desc Disconnect from the Chrome instance currently attached to (if any) and exit the process
   */
  shutdown () {
    // The client only exists after init(); the process must still exit when it was never created
    if (this._client) {
      this._client.close()
    }
    process.exit()
  }

  /**
   * @desc Initialize the WARC writer for writing a new WARC
   * @param {string} warcPath           - the path to the new WARC
   * @param {boolean} [appending=false] - append to an already existing WARC file
   */
  initWARC (warcPath, appending = false) {
    this._warcGenerator.initWARC(warcPath, appending)
  }

  /**
   * @desc Alias for {@link genWarc}
   * @param {!Object} warcInfo    - WARC record information
   * @property {!string} outlinks - Pre-formatted string containing the pages outlinks to be used by the WARC metadata record
   * @property {?Object} info     - Information for the WARC info record
   * @return {Promise<void, Error>}
   */
  genWARC (warcInfo) {
    return this.genWarc(warcInfo)
  }

  /**
   * @desc Generate the WARC file: writes the info and metadata records, stops request capturing,
   * writes one entry per captured request and finally ends the WARC generator.
   * A request whose entry generation fails is logged and skipped
   * @param {!Object} warcInfo    - WARC record information
   * @property {!string} outlinks - Pre-formatted string containing the pages outlinks to be used by the WARC metadata record
   * @property {?Object} info     - Information for the WARC info record, missing fields are filled in (on the passed object) from options.versionInfo
   * @return {Promise<void, Error>}
   */
  async genWarc ({ info, outlinks }) {
    await this._writeInfoAndOutlinks(info || {}, outlinks)
    this.requestMonitor.stopCapturing()
    for (const nreq of this.requestMonitor.iterateRequests()) {
      try {
        await this._warcGenerator.generateWarcEntry(nreq, this._client.Network)
      } catch (error) {
        // Best effort: one bad request must not abort the remainder of the WARC
        console.error(error)
      }
    }
    this._warcGenerator.end()
  }

  /**
   * @desc Generate the WARC Info and Metadata records
   * @param {!Object} warcInfo    - WARC record information
   * @property {!string} outlinks - Pre-formatted string containing the pages outlinks to be used by the WARC metadata record
   * @property {?Object} info     - Information for the WARC info record, missing fields are filled in (on the passed object) from options.versionInfo
   * @return {!Promise<void>}
   */
  async genInfoMetaDataRecord ({ info, outlinks }) {
    const warcInfo = info || {}
    warcInfo.v = warcInfo.v || this.options.versionInfo.v
    await this._writeInfoAndOutlinks(warcInfo, outlinks)
  }

  /**
   * @desc Shared by {@link genWarc} and {@link genInfoMetaDataRecord}: fills in the missing
   * isPartOfV / warcInfoDescription fields of info from options.versionInfo, then writes the
   * WARC info record followed by the outlinks metadata record for the current url.
   * The default UA from defaults is used when the browsers user-agent has not been resolved
   * @param {!Object} info     - Information for the WARC info record (mutated in place)
   * @param {!string} outlinks - Pre-formatted string containing the pages outlinks
   * @return {!Promise<void>}
   * @private
   */
  async _writeInfoAndOutlinks (info, outlinks) {
    const { versionInfo } = this.options
    info.isPartOfV = info.isPartOfV || versionInfo.isPartOfV
    info.warcInfoDescription =
      info.warcInfoDescription || versionInfo.warcInfoDescription
    await this._warcGenerator.writeWarcInfoRecord(
      info.isPartOfV,
      info.warcInfoDescription,
      this._ua || defaults.defaultOpts.UA
    )
    await this._warcGenerator.writeWarcMetadataOutlinks(this._currentUrl, outlinks)
  }

  /**
   * @desc Retrieve the page's meta information by evaluating the collect inject in the page
   * @return {Promise<{outlinks: string, links: string[], location: string}, Error>}
   */
  async getOutLinks () {
    const evaled = await this._client.Runtime.evaluate(InjectManager.getCollectInject())
    return evaled.result.value
  }

  /**
   * @desc Retrieve the browsers user-agent string. When the browser reports itself as
   * HeadlessChrome the marker is rewritten to Chrome and the rewritten string is installed
   * as the user-agent override
   * @return {Promise<string>}
   */
  async getUserAgent () {
    let { userAgent } = await this._client.Browser.getVersion()
    if (userAgent.includes('HeadlessChrome/')) {
      // We are not a robot, pinkie promise!
      userAgent = userAgent.replace('HeadlessChrome/', 'Chrome/')
      await this._client.Network.setUserAgentOverride({ userAgent })
    }
    return userAgent
  }

  /**
   * @desc Iterate over the captured network requests for the current web page
   * @return {Iterator<CapturedRequest>}
   */
  [Symbol.iterator] () {
    return this.requestMonitor.values()
  }

  /**
   * @desc Callback used for Page.navigate, forwards whatever the protocol client
   * passes to the callback on to the navigation manager
   * @param {...*} args - The arguments supplied by the Page.navigate callback
   * @private
   */
  _didNavigate (...args) {
    this._navMan.didNavigate(...args)
  }

  /**
   * @desc Enable auto closing of the connection to the remote browser.
   * The process exit listener is registered at most once per crawler
   * @return {ChromeCrawler}
   * @private
   */
  enableAutoClose () {
    if (!this._autoClose) {
      process.on('exit', this._close)
    }
    this._autoClose = true
    return this
  }

  /**
   * @desc Callback for process.on('exit'), closes the client when one exists
   * @private
   */
  _close () {
    if (this._client) {
      return this._client.close()
    }
  }

  /**
   * @desc Listener for warc generator error
   * @emits {error} with type 'warc-gen'
   * @param {Error} err
   * @private
   */
  _onWARCGenError (err) {
    this.emit('error', { type: 'warc-gen', err })
  }

  /**
   * @desc Listener for warc generator finished
   * @emits {warc-gen-finished}
   * @private
   */
  _onWARCGenFinished () {
    this.emit('warc-gen-finished')
  }

  /**
   * @desc Create a new {@link ChromeCrawler} instance with auto close enabled
   * @param {CrawlConfig} options - The crawl config for this crawl
   * @return {ChromeCrawler}
   */
  static withAutoClose (options) {
    return new ChromeCrawler(options).enableAutoClose()
  }
}

/**
 * @desc The ChromeCrawler class is this module's sole export
 * @type {ChromeCrawler}
 */
module.exports = ChromeCrawler