Reference Source

lib/runners/puppeteerRunner.js

/*
 Squidwarc  Copyright (C) 2017  John Berlin <n0tan3rd@gmail.com>

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 Squidwarc is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with Squidwarc.  If not, see <http://www.gnu.org/licenses/>.
 */

const cp = require('../utils/colorPrinters')
const PuppeteerCrawler = require('../crawler/puppeteer')
const Frontier = require('../frontier')
const WARCNaming = require('../utils/warcNaming')

/**
 * @desc Launches a crawl driven by the supplied crawl configuration.
 *
 * The frontier is initialised with `conf.seeds`; for every URL the frontier
 * yields, the crawler navigates to it and, when navigation reports success,
 * runs the user script, starts a WARC at the path produced by the WARC naming
 * function, collects the page's out links (feeding `links` back into the
 * frontier) and generates the WARC. The crawler is shut down once the
 * frontier is exhausted and also when the crawl loop fails, so a mid-crawl
 * error no longer leaves the crawler running.
 *
 * If `conf.seeds` is `null`/`undefined` the process is terminated via
 * `process.exit(0)` after reporting the configuration error.
 *
 * @param {CrawlConfig} conf - The crawl config for this crawl
 * @return {Promise<void, Error>} Resolves after the crawler has been shut
 * down; rejects with the error that aborted the crawl (after a best-effort
 * shutdown) or with an error raised by `crawler.init()` / the final shutdown
 */
async function puppeteerRunner (conf) {
  const frontier = new Frontier()
  cp.crawlerOpt('Crawler Operating In', conf.mode, 'mode')
  if (conf.seeds == null) {
    cp.configError('No Seeds Were Provided Via The Config File', conf)
    cp.bred('Crawler Shutting Down. GoodBy')
    // NOTE(review): exits with status 0 although this is a configuration
    // error - kept as is because callers may rely on it; confirm.
    process.exit(0)
  }

  if (Array.isArray(conf.seeds)) {
    cp.crawlerOpt('Crawler Will Be Preserving', `${conf.seeds.length} Seeds`)
  } else {
    cp.crawlerOpt('Crawler Will Be Preserving', conf.seeds)
  }

  frontier.init(conf.seeds)
  // Function mapping a seed URL to the path of the WARC written for it
  const warcFilePath = WARCNaming.getWarcNamingFunction(conf)

  cp.crawlerOpt('Crawler Generated WARCs Will Be Placed At', conf.warc.output)

  const crawler = new PuppeteerCrawler(conf)

  crawler.on('error', async err => {
    cp.error('Crawler Encountered A Random Error', err.err)
  })

  crawler.on('disconnect', async () => {
    cp.bred('Crawlers Connection To The Remote Browser Has Closed')
    // Nothing awaits an event handler's promise, so a failing shutdown has to
    // be handled here or it surfaces as an unhandled rejection.
    try {
      await crawler.shutdown()
    } catch (shutdownError) {
      cp.error('Crawler Failed To Shutdown Cleanly', shutdownError)
    }
  })

  await crawler.init()
  try {
    while (!frontier.exhausted()) {
      const currentSeed = frontier.next()
      cp.cyan(`Crawler Navigating To ${currentSeed}`)
      const good = await crawler.navigate(currentSeed)
      if (good) {
        cp.cyan(`Crawler Navigated To ${currentSeed}`)
        await crawler.runUserScript()
        cp.cyan(`Crawler Generating WARC`)
        // Started before genWarc and awaited after it: initWARC's promise is
        // the completion signal for the WARC being written.
        const donePromise = crawler.initWARC(warcFilePath(currentSeed), conf.warc.append)
        const { outlinks, links } = await crawler.getOutLinks()
        frontier.process(links)
        await crawler.genWarc({ outlinks })
        await donePromise
      }
      await crawler.stopPageLoading()
      cp.cyan(`Crawler Has ${frontier.size()} Seeds Left To Crawl`)
    }
  } catch (crawlError) {
    // Best-effort cleanup: release the crawler, but never let a shutdown
    // failure mask the error that actually aborted the crawl.
    try {
      await crawler.shutdown()
    } catch (shutdownError) {
      cp.error('Crawler Failed To Shutdown Cleanly', shutdownError)
    }
    throw crawlError
  }
  cp.cyan(`Crawler shutting down. Have nice day :)`)
  await crawler.shutdown()
}

// The runner function is this module's sole export
module.exports = puppeteerRunner