lib/crawler/navigationMan.js
/*
Squidwarc Copyright (C) 2017-present John Berlin <n0tan3rd@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Squidwarc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this Squidwarc. If not, see <http://www.gnu.org/licenses/>
*/
const EventEmitter = require('eventemitter3')
/**
 * @desc Monitor navigation and request events for crawling a page.
 *
 * Typical life cycle: call {@link NavigationMan#startedNav} when the browser is told to
 * navigate, feed request start/finish notifications to {@link NavigationMan#reqStarted} and
 * {@link NavigationMan#reqFinished}, and call {@link NavigationMan#didNavigate} or
 * {@link NavigationMan#navigationError} once the outcome of the navigation is known.
 * All events are emitted on the parent emitter when one was supplied, otherwise on this instance.
 * @emits {network-idle} when network idle has been detected or global-wait timer has fired
 * @emits {navigated} when the browser has navigated, the argument is the current URL
 * @emits {navigation-timedout} when the browser has not navigated in time, the argument is the current URL
 * @emits {navigation-error} when navigationError was called, the argument is an Error (strings are wrapped)
 * @extends {EventEmitter}
 */
class NavigationMan extends EventEmitter {
  /**
   * @param {CrawlControl} [options = {}] - reads globalWait, inflightIdle, numInflight and navWait;
   * a falsy value (missing or 0) for any of them selects the built-in default
   * @param {EventEmitter} [parentEmitter] - when supplied, events are emitted on it rather than on this instance
   */
  constructor (options = {}, parentEmitter) {
    super()
    // The default parameter only covers undefined; tolerate an explicit null as well
    const opts = options || {}
    /**
     * @desc Maximum amount of time, in milliseconds, before generating a WARC and moving to the next URL
     * @type {number}
     * @private
     */
    this._timeout = opts.globalWait || 60000 // could be 30 seconds
    /**
     * @desc Amount of time, in milliseconds, that should elapse when there are only {@link _idleInflight} requests for network idle to be determined
     * @type {number}
     * @private
     */
    this._idleTime = opts.inflightIdle || 1000 // could be 1500 (1.5 seconds)
    /**
     * @desc The number of inflight requests (requests with no response) at or below which the inflightIdle timer is started
     * @type {number}
     * @private
     */
    this._idleInflight = opts.numInflight || 2 // could be 4
    /**
     * @desc How long, in milliseconds, we should wait for navigation to occur before emitting the navigation-timedout event
     * @type {number}
     * @private
     */
    this._navTimeoutTime = opts.navWait || 8000
    /**
     * @desc Set of the HTTP requests ids, used for tracking network-idle
     * @type {Set<string>}
     * @private
     */
    this._requestIds = new Set()
    /**
     * @desc The handle returned by setTimeout for the network-idle timer, null when not armed
     * @type {?(number|Object)}
     * @private
     */
    this._idleTimer = null
    /**
     * @desc Flag indicating that network tracking has finished (true) or is still in progress (false)
     * @type {boolean}
     * @private
     */
    this._doneTimers = false
    /**
     * @desc The handle returned by setTimeout for the global crawler timer, null when not armed
     * @type {?(number|Object)}
     * @private
     */
    this._globalWaitTimer = null
    /**
     * @desc The handle returned by setTimeout for the navigation timer, null when not armed
     * @type {?(number|Object)}
     * @private
     */
    this._navTimeout = null
    /**
     * @desc An optional EventEmitter that we should emit this emitters events to rather than via ourselves
     * @type {?EventEmitter}
     * @private
     */
    this._parentEmitter = parentEmitter
    /**
     * @desc The url of the page a crawler is visiting
     * @type {?string}
     * @private
     */
    this._curl = null
    // These methods are handed to setTimeout or registered as event listeners, so pin `this`
    this._networkIdled = this._networkIdled.bind(this)
    this._globalNetworkTimeout = this._globalNetworkTimeout.bind(this)
    this.didNavigate = this.didNavigate.bind(this)
    this._navTimedOut = this._navTimedOut.bind(this)
    this.reqFinished = this.reqFinished.bind(this)
    this.reqStarted = this.reqStarted.bind(this)
  }

  /**
   * @desc Start Timers For Navigation Monitoring. Any timer still pending from a previous
   * navigation is cancelled first so it can not fire an event for the new URL.
   * @param {string} curl the URL browser is navigating to
   */
  startedNav (curl) {
    // Without this a second call would leak the old timers and a stale one could
    // emit network-idle / navigation-timedout while the new page is still loading
    this._clearNavTimeout()
    this._clearTimers()
    this._curl = curl
    this._requestIds.clear()
    this._doneTimers = false // _clearTimers marked tracking as done, re-enable it
    this._navTimeout = setTimeout(this._navTimedOut, this._navTimeoutTime)
    this._globalWaitTimer = setTimeout(this._globalNetworkTimeout, this._timeout)
  }

  /**
   * @desc Indicate that a request was made. Ignored once network tracking has finished.
   * @param {Object} info - CDP object received from Network.requestWillBeSent
   * @see https://chromedevtools.github.io/devtools-protocol/tot/Network#event-requestWillBeSent
   */
  reqStarted (info) {
    if (this._doneTimers) return
    this._requestIds.add(info.requestId)
    if (this._requestIds.size > this._idleInflight) {
      // Too many requests in flight to be considered idle: cancel a pending idle countdown
      clearTimeout(this._idleTimer)
      this._idleTimer = null
    }
  }

  /**
   * @desc Indicate that a request has finished. Ignored once network tracking has finished.
   * @param {Object} info - CDP Response object received by Network.responseReceived or Network.loadingFailed
   * @see https://chromedevtools.github.io/devtools-protocol/tot/Network#event-responseReceived
   * @see https://chromedevtools.github.io/devtools-protocol/tot/Network#event-loadingFailed
   */
  reqFinished (info) {
    if (this._doneTimers) return
    this._requestIds.delete(info.requestId)
    // Arm the idle countdown only once; it keeps running while we stay at or below the threshold
    if (this._requestIds.size <= this._idleInflight && !this._idleTimer) {
      this._idleTimer = setTimeout(this._networkIdled, this._idleTime)
    }
  }

  /**
   * @desc Indicate that the browser has navigated to the current URL.
   * Cancels the navigation timer and emits 'navigated' with the current URL.
   */
  didNavigate () {
    this._clearNavTimeout()
    this._emitEvent('navigated', this._curl)
  }

  /**
   * @desc Used to have the NavigationManager emit the 'navigation-error' event.
   * Only the navigation timer is cancelled; the global-wait and idle timers are left untouched.
   * @param {Error | string} err - a string is wrapped in a new Error, anything else is emitted as is
   */
  navigationError (err) {
    this._clearNavTimeout()
    const emitted = typeof err === 'string' ? new Error(err) : err
    this._emitEvent('navigation-error', emitted)
  }

  /**
   * @desc Called when the navigation time limit was hit, emits 'navigation-timedout' with the current URL
   * @private
   */
  _navTimedOut () {
    this._clearNavTimeout()
    this._emitEvent('navigation-timedout', this._curl)
  }

  /**
   * @desc Called when the global time limit was hit
   * @private
   */
  _globalNetworkTimeout () {
    this._finishNetworkTracking()
  }

  /**
   * @desc Called when the network idle has been determined
   * @private
   */
  _networkIdled () {
    this._finishNetworkTracking()
  }

  /**
   * @desc Stop network tracking and emit 'network-idle' (shared by the idle and global-wait handlers)
   * @private
   */
  _finishNetworkTracking () {
    this._clearTimers()
    this._emitEvent('network-idle')
  }

  /**
   * @desc Cancel the navigation timer if it is armed
   * @private
   */
  _clearNavTimeout () {
    if (this._navTimeout) {
      clearTimeout(this._navTimeout)
      this._navTimeout = null
    }
  }

  /**
   * @desc Mark network tracking as done and clear the global-wait and idle timers
   * (the navigation timer is handled separately by {@link _clearNavTimeout})
   * @private
   */
  _clearTimers () {
    this._doneTimers = true
    if (this._globalWaitTimer) {
      clearTimeout(this._globalWaitTimer)
      this._globalWaitTimer = null
    }
    if (this._idleTimer) {
      clearTimeout(this._idleTimer)
      this._idleTimer = null
    }
  }

  /**
   * @desc Emit an event on the parent emitter when one was supplied, otherwise on this instance
   * @param {string} event - The event name to be emitted
   * @param [arg] - The value to be emitted for the event
   * @private
   */
  _emitEvent (event, arg) {
    const target = this._parentEmitter || this
    target.emit(event, arg)
  }
}
/**
 * @desc The module's sole export is the NavigationMan class itself (the constructor, not an instance)
 * @type {NavigationMan}
 */
module.exports = NavigationMan