Reference Source

lib/crawler/puppeteer.js

/*
 Squidwarc  Copyright (C) 2017-present  John Berlin <n0tan3rd@gmail.com>

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 Squidwarc is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this Squidwarc.  If not, see <http://www.gnu.org/licenses/>
 */
const EventEmitter = require('eventemitter3')
const { Browser } = require('puppeteer/lib/Browser')
const { TimeoutError } = require('puppeteer/lib/Errors')
const autobind = require('class-autobind')
const PuppeteerCDPRequestCapturer = require('../../node-warc/lib/requestCapturers/puppeteerCDP')
const PuppeteerCDPWARCGenerator = require('../../node-warc/lib/writers/puppeteerCDP')
const InjectManager = require('../injectManager')
const launch = require('../launcher/puppeteer')
const NetIdle = require('./netIdleWatcher')
const cp = require('../utils/colorPrinters')
const { configureURLNormalizer } = require('../utils/urlUtils')

/**
 * @desc URL normalizer created with the `stripFragment` and `stripWWW` options enabled.
 * It is applied to both sides of a URL comparison so that such differences are ignored
 * @type {(function(string): string)}
 */
const urlNormalizer = configureURLNormalizer({
  stripWWW: true,
  stripFragment: true
})

/**
 * @desc Crawler based on puppeteer
 * @extends {EventEmitter}
 */
class PuppeteerCrawler extends EventEmitter {
  /**
   * @desc Create a new PuppeteerCrawler instance. For a description of the expected options see the
   * JSDoc CrawlConfig typedef {@link CrawlConfig}
   * @param {CrawlConfig} options - The crawl config for this crawl
   */
  constructor (options) {
    super()

    /**
     * @desc An instance of puppeteer Browser, set by {@link init}
     * @type {?Browser}
     */
    this._browser = null

    /**
     * @desc An instance of puppeteer Page, set by {@link init}
     * @type {?Page}
     */
    this._page = null

    /**
     * @desc The CDPSession of the page, used to send raw Chrome DevTools Protocol
     * commands and handed to the request capturer and the WARC generator
     * @type {?CDPSession}
     * @private
     */
    this._client = null

    /**
     * @desc The UserAgent string of the browser
     * @type {string}
     * @private
     */
    this._ua = ''

    /**
     * @desc The current url the crawler is visiting
     * @type {?string}
     */
    this._currentUrl = null

    /**
     * @desc Crawl configuration options
     * @type {CrawlConfig}
     */
    this.options = options

    /**
     * @desc Captures the network requests made by the page, created by {@link init}
     * @type {?PuppeteerCDPRequestCapturer}
     */
    this.requestCapturer = null

    /**
     * @desc Writes the captured requests to a WARC file
     * @type {PuppeteerCDPWARCGenerator}
     * @private
     */
    this._warcGenerator = new PuppeteerCDPWARCGenerator()

    /**
     * @desc Options passed to page.goto by {@link navigate}
     * @type {{waitUntil: string, timeout: number}}
     */
    this.defaultWait = { waitUntil: 'networkidle0', timeout: 60000 }

    // methods of this class are handed out as event listeners (see init),
    // so they are bound to this instance up front
    autobind.default(this, PuppeteerCrawler.prototype)
  }

  /**
   * @desc CB used to emit the disconnected event
   * @private
   */
  _onDisconnected () {
    this.emit('disconnected')
  }

  /**
   * @desc Setup the crawler: launch the browser, open a page, configure it over
   * the DevTools protocol, attach the request capturer and wire up the WARC generator events
   * @return {Promise<void>}
   */
  async init () {
    this._browser = await launch(this.options.chrome)
    this._browser.on(Browser.Events.Disconnected, this._onDisconnected)
    this._page = await this._browser.newPage()
    // NOTE(review): _client looks like a puppeteer internal rather than public API - confirm on upgrade
    this._client = this._page._client

    await this._client.send('Animation.setPlaybackRate', { playbackRate: 1000 })
    await this._client.send('Network.setBypassServiceWorker', { bypass: true })
    await this._client.send(
      'Page.addScriptToEvaluateOnNewDocument',
      InjectManager.getNoNaughtyJsInject()
    )
    // so much noise
    this._client.removeAllListeners('Performance.metrics')
    this._client.removeAllListeners('Log.entryAdded')
    this._client.removeAllListeners('Runtime.consoleAPICalled')
    this._client.removeAllListeners('Runtime.exceptionThrown')

    this.requestCapturer = new PuppeteerCDPRequestCapturer()
    this.requestCapturer.attach(this._client)
    this._warcGenerator.on('finished', this._onWARCGenFinished)
    this._warcGenerator.on('error', this._onWARCGenError)
    this._ua = await this.getUserAgent()
  }

  /**
   * @desc Navigate the browser to the URL of the page to be crawled
   * @param {string} url - The URL to navigate to
   * @returns {Promise<boolean>} true if the navigation succeeded, or if it timed out
   * but the main frame is at the requested URL; false on any other navigation error
   */
  async navigate (url) {
    this._currentUrl = url
    this.requestCapturer.startCapturing()
    try {
      await this._page.goto(url, this.defaultWait)
    } catch (e) {
      if (e instanceof TimeoutError) {
        const mainFrame = this._page.mainFrame()
        if (
          urlNormalizer(url) ===
          urlNormalizer(mainFrame._navigationURL || mainFrame.url())
        ) {
          // we did navigate to the page, the network just never became idle within the timeout
          return true
        }
      }
      cp.error('Crawler encountered a navigation error', e)
      return false
    }
    return true
  }

  /**
   * @desc If the user supplied a script, that script is executed; if none was supplied
   * the page is scrolled instead. In both cases the crawler then waits for network idle
   * @return {Promise<void>}
   */
  async runUserScript () {
    if (this.options.script) {
      cp.cyan('Running user script')
      try {
        await this.options.script(this._page)
      } catch (e) {
        // a failing user script must not abort the crawl of this page
        cp.error('An exception was thrown while running the user script', e)
      }
    } else {
      await this._page.evaluate(InjectManager.rawScoll())
    }
    await NetIdle.idlePromise(this._page, this.options.crawlControl)
  }

  /**
   * @desc Equivalent to hitting the refresh button when it is an X
   * @return {!Promise<?Object>}
   */
  stopPageLoading () {
    return this._client.send('Page.stopLoading')
  }

  /**
   * @desc Stop capturing the current web pages network requests
   */
  stopCapturingNetwork () {
    this.requestCapturer.stopCapturing()
  }

  /**
   * @desc Stop the page loading and stop capturing requests
   * @return {!Promise<?Object>}
   */
  stop () {
    this.stopCapturingNetwork()
    return this.stopPageLoading()
  }

  /**
   * @desc Stop crawling and exit. Safe to call when {@link init} did not complete:
   * the capturer and the browser are only touched if they exist, and the browser
   * is closed even if stopping the capture throws
   * @return {Promise<void>}
   */
  async shutdown () {
    try {
      if (this.requestCapturer) {
        this.requestCapturer.stopCapturing()
      }
    } finally {
      if (this._browser) {
        await this._browser.close()
      }
    }
  }

  /**
   * @desc Initialize the WARC writer for writing a new WARC
   * @param {string} warcPath           - the path to the new WARC
   * @param {boolean} [appending=false] - append to an already existing WARC file
   * @return {Promise<void>} A Promise that resolves once the `warc-gen-finished` event is emitted
   */
  initWARC (warcPath, appending = false) {
    this._warcGenerator.initWARC(warcPath, appending)
    return new Promise(resolve => {
      // once, not on: the resolver is only useful for this WARC and must not
      // pile up as a stale listener for every WARC written by this crawler
      this.once('warc-gen-finished', resolve)
    })
  }

  /**
   * @desc Alias for {@link genWarc}
   * @param {Object} warcInfo           - WARC record information
   * @param {!string} warcInfo.outlinks - Pre-formatted string containing the pages outlinks to be used by the WARC metadata record
   * @param {?Object} [warcInfo.info]   - Information for the WARC info record
   * @return {Promise<void, Error>}
   */
  genWARC (warcInfo) {
    return this.genWarc(warcInfo)
  }

  /**
   * @desc Generate the WARC file: write the info and metadata records, stop capturing,
   * write one entry per captured request and then end the WARC generator.
   * A failure to write a single entry is logged and does not stop the remaining entries
   * @param {Object} warcInfo           - WARC record information
   * @param {!string} warcInfo.outlinks - Pre-formatted string containing the pages outlinks to be used by the WARC metadata record
   * @param {?Object} [warcInfo.info]   - Information for the WARC info record
   * @return {Promise<void, Error>}
   */
  async genWarc ({ outlinks, info }) {
    await this._writeInfoAndMetadataRecords(info, outlinks)
    this.requestCapturer.stopCapturing()
    for (const nreq of this.requestCapturer.iterateRequests()) {
      try {
        await this._warcGenerator.generateWarcEntry(nreq, this._client)
      } catch (error) {
        console.error(error)
      }
    }
    this._warcGenerator.end()
  }

  /**
   * @desc Generate the WARC Info and Metadata records
   * @param {Object} warcInfo           - WARC record information
   * @param {!string} warcInfo.outlinks - Pre-formatted string containing the pages outlinks to be used by the WARC metadata record
   * @param {?Object} [warcInfo.info]   - Information for the WARC info record
   * @return {!Promise<void>}
   */
  async genInfoMetaDataRecord ({ info, outlinks }) {
    await this._writeInfoAndMetadataRecords(info, outlinks)
  }

  /**
   * @desc Resolve the values used for the WARC info record, falling back to the crawl
   * config's `versionInfo` for anything not supplied. The supplied object is not modified
   * @param {?Object} info - Caller supplied information for the WARC info record
   * @return {{isPartOfV: *, warcInfoDescription: *}}
   * @private
   */
  _resolveWarcInfo (info) {
    const supplied = info || {}
    // versionInfo is only consulted when the caller did not supply the value
    return {
      isPartOfV: supplied.isPartOfV || this.options.versionInfo.isPartOfV,
      warcInfoDescription:
        supplied.warcInfoDescription || this.options.versionInfo.warcInfoDescription
    }
  }

  /**
   * @desc Write the WARC info record followed by the metadata (outlinks) record for the current URL
   * @param {?Object} info     - Caller supplied information for the WARC info record
   * @param {string} outlinks  - Pre-formatted outlinks string for the WARC metadata record
   * @return {Promise<void>}
   * @private
   */
  async _writeInfoAndMetadataRecords (info, outlinks) {
    const { isPartOfV, warcInfoDescription } = this._resolveWarcInfo(info)
    await this._warcGenerator.writeWarcInfoRecord(isPartOfV, warcInfoDescription, this._ua)
    await this._warcGenerator.writeWarcMetadataOutlinks(this._currentUrl, outlinks)
  }

  /**
   * @desc Retrieve the page's meta information by evaluating the outlink collection
   * script in every frame of the page. Frames are visited from last to first and
   * frames in which the script fails are skipped
   * @return {Promise<{outlinks: string, links: Array<{href: string, pathname: string, host: string}>, location: string}, Error>}
   */
  async getOutLinks () {
    const frames = this._page.frames()
    const discoveredLinks = {
      outlinks: '',
      links: [],
      location: this._page.url()
    }
    const outlinksFN = InjectManager.rawOutLinks()
    for (let i = frames.length - 1; i >= 0; i--) {
      try {
        const { outlinks, links } = await frames[i].evaluate(outlinksFN)
        discoveredLinks.outlinks += outlinks
        discoveredLinks.links = discoveredLinks.links.concat(links)
      } catch (e) {
        // deliberate best-effort: presumably the frame was detached or navigated
        // while evaluating (TODO confirm); its links are simply not collected
      }
    }
    return discoveredLinks
  }

  /**
   * @desc Retrieve the browsers user-agent string. When the browser identifies itself
   * as HeadlessChrome the string is rewritten to Chrome and also set on the page
   * @return {!Promise<string>}
   */
  async getUserAgent () {
    let ua = await this._browser.userAgent()
    if (ua.includes('HeadlessChrome/')) {
      // We are not a robot, pinkie promise!
      ua = ua.replace('HeadlessChrome/', 'Chrome/')
      await this._page.setUserAgent(ua)
    }
    return ua
  }

  /**
   * @desc Iterate over the captured network requests for the current web page
   * @return {Iterator<CapturedRequest>}
   */
  [Symbol.iterator] () {
    return this.requestCapturer.values()
  }

  /**
   * @desc Listener for warc generator error, re-emitted as this crawler's `error` event
   * @param {Error} err - The error to emit
   * @private
   */
  _onWARCGenError (err) {
    this.emit('error', { type: 'warc-gen', err })
  }

  /**
   * @desc Listener for warc generator finished, re-emitted as `warc-gen-finished`
   * @private
   */
  _onWARCGenFinished () {
    this.emit('warc-gen-finished')
  }
}

/**
 * @desc The puppeteer based crawler class, this module's sole export
 * @type {PuppeteerCrawler}
 */
module.exports = PuppeteerCrawler