lib/crawler/netIdleWatcher.js
/*
Squidwarc Copyright (C) 2017-present John Berlin <n0tan3rd@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Squidwarc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this Squidwarc. If not, see <http://www.gnu.org/licenses/>
*/
const EventEmitter = require('eventemitter3')
const { Page } = require('puppeteer/lib/Page')
const autobind = require('class-autobind')
const H = require('./helper')
/**
* @desc Monitors the HTTP requests made by a page and emits the 'network-idle' event when it has been determined the network is idle
* Used by {@link PuppeteerCrawler}
* @extends {EventEmitter}
*/
class NetIdleWatcher extends EventEmitter {
  /**
   * @desc Create a watcher for the supplied page. No listeners or timers are set up until {@link NetIdleWatcher#start} is called
   * @param {Page} page - Puppeteer page object for the page being crawled
   * @param {?NetIdleOptions} [options = {}] - Optional options to fine tune network idle determination
   */
  constructor (page, options = {}) {
    super()
    // The parameter is documented as nullable, so an explicit null must behave like "no options"
    const opts = options || {}
    /**
     * @desc Maximum amount of time, in milliseconds, the watcher waits before emitting network-idle regardless of network activity
     * @type {number}
     * @private
     */
    this._timeout = opts.globalWait || 40000
    /**
     * @desc The amount of time, in milliseconds, the network must stay quiet before the network-idle event is emitted
     * @type {number}
     * @private
     */
    this._idleTime = opts.inflightIdle || 1500
    /**
     * @desc The maximum number of in-flight requests at which the network is still considered quiet.
     * Only null/undefined fall back to the default so that an explicit 0 (wait for zero in-flight requests) is honored
     * @type {number}
     * @private
     */
    this._idleInflight = opts.numInflight != null ? opts.numInflight : 2
    /**
     * @desc Set of the ids of the HTTP requests currently in flight, used for tracking network-idle
     * @type {Set<string>}
     * @private
     */
    this._requestIds = new Set()
    /**
     * @desc The id of the setTimeout for the network-idle timer, null when the timer is not running
     * @type {?number}
     * @private
     */
    this._idleTimer = null
    /**
     * @desc Flag indicating that network idle (or the global timeout) has been reached and request tracking has stopped
     * @type {boolean}
     * @private
     */
    this._doneTimers = false
    /**
     * @desc The id of the global wait setTimeout timer, null when the timer is not running
     * @type {?number}
     * @private
     */
    this._globalWaitTimer = null
    /**
     * @desc The page object of the current page the crawler is visiting
     * @type {Page}
     */
    this.page = page
    /**
     * @desc An array of listeners registered on the page object
     * @type {{emitter: !EventEmitter, eventName: string, handler: function()}[]}
     * @private
     */
    this._pageListenrs = []
    // Bind every prototype method to this instance so they can be handed out as bare callbacks
    autobind.default(this, NetIdleWatcher.prototype)
  }

  /**
   * @desc Start monitoring the network and receive a Promise that resolves once network idle occurred or the global wait time has been reached
   * @param {Page} page - Puppeteer page object for the page being crawled
   * @param {?NetIdleOptions} [options = {}] - Optional options to fine tune network idle determination
   * @return {Promise<void>}
   */
  static idlePromise (page, options) {
    const watcher = new NetIdleWatcher(page, options)
    return new Promise(resolve => {
      // network-idle is a one-shot signal; subscribe before starting so it can never be missed
      watcher.once('network-idle', resolve)
      watcher.start()
    })
  }

  /**
   * @desc Register the page listeners, reset the tracking state and arm the global wait timer.
   * Calling start again first tears down the listeners and timers of the previous run
   */
  start () {
    // Without this a repeated start() would leak the previous page listeners and global timer
    if (this._pageListenrs.length > 0) {
      H.removeEventListeners(this._pageListenrs)
    }
    this._clearTimers()
    this._pageListenrs = [
      H.addEventListener(this.page, Page.Events.Request, this.reqStarted),
      H.addEventListener(this.page, Page.Events.Response, this.reqFinished),
      H.addEventListener(this.page, Page.Events.RequestFailed, this.reqFinished)
    ]
    this._requestIds.clear()
    this._doneTimers = false
    this._globalWaitTimer = setTimeout(this._globalNetworkTimeout, this._timeout)
  }

  /**
   * @desc Indicate that a request was made
   * @param {Request} info - Puppeteer Request object
   */
  reqStarted (info) {
    if (this._doneTimers) return
    this._requestIds.add(info._requestId)
    if (this._requestIds.size > this._idleInflight) {
      // Too many requests in flight: the network is busy again, cancel any pending idle countdown
      clearTimeout(this._idleTimer)
      this._idleTimer = null
    }
  }

  /**
   * @desc Indicate that a request has finished, either with a response or by failing
   * @param {Response | Request} info - Puppeteer Request or Response object
   */
  reqFinished (info) {
    if (this._doneTimers) return
    // An object carrying _requestId is used directly, otherwise the id is read from info.request()
    const requestId = info._requestId ? info._requestId : info.request()._requestId
    this._requestIds.delete(requestId)
    if (this._requestIds.size <= this._idleInflight && !this._idleTimer) {
      this._idleTimer = setTimeout(this._networkIdled, this._idleTime)
    }
  }

  /**
   * @desc Called when the global time limit was hit
   * @private
   */
  _globalNetworkTimeout () {
    this._finish()
  }

  /**
   * @desc Called when the network idle has been determined
   * @private
   */
  _networkIdled () {
    this._finish()
  }

  /**
   * @desc Stop tracking, clear all timers and schedule the network-idle event. Does nothing if tracking has already stopped
   * @private
   */
  _finish () {
    if (this._doneTimers) return
    this._doneTimers = true
    this._clearTimers()
    process.nextTick(this._emitNetIdle)
  }

  /**
   * @desc Remove the page listeners and emit the network-idle event
   * @private
   */
  _emitNetIdle () {
    H.removeEventListeners(this._pageListenrs)
    this.emit('network-idle')
  }

  /**
   * @desc Clear all timers
   * @private
   */
  _clearTimers () {
    if (this._globalWaitTimer) {
      clearTimeout(this._globalWaitTimer)
      this._globalWaitTimer = null
    }
    if (this._idleTimer) {
      clearTimeout(this._idleTimer)
      this._idleTimer = null
    }
  }
}
/**
* @desc The NetIdleWatcher class is the sole export of this module
* @type {NetIdleWatcher}
*/
module.exports = NetIdleWatcher
/**
* @typedef {Object} NetIdleOptions
* @property {number} [globalWait = 40000] - Maximum amount of time, in milliseconds, to wait for network idle to occur
* @property {number} [numInflight = 2] - The maximum number of in-flight requests (requests that have not yet received a response or failed) at which the network is considered quiet and the inflightIdle timer is started
* @property {number} [inflightIdle = 1500] - Amount of time, in milliseconds, that must elapse with at most numInflight requests in flight for network idle to be determined
*/
/**
* @external {Page} https://pptr.dev/#?product=Puppeteer&version=v1.7.0&show=api-class-page
*/