// lib/frontier/seedTracker.js
/*
Squidwarc Copyright (C) 2017-present John Berlin <n0tan3rd@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Squidwarc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this Squidwarc. If not, see <http://www.gnu.org/licenses/>
*/
/**
 * @desc Per-seed bookkeeping for a crawl. A crawl may start from several seeds, and every seed can
 * lead to further URLs that need crawling; a SeedTracker gathers that state for one starting seed.
 * It remembers which URLs have been discovered for the seed, counts how many of them are still
 * waiting to be crawled, and carries the seed's crawl mode and depth so they can be applied for
 * the whole of the crawl.
 */
class SeedTracker {
  /**
   * @param {string} url - The starting seed URL
   * @param {Symbol} mode - The crawl mode this seed operates under
   * @param {number} depth - The crawl depth for this seed
   */
  constructor (url, mode, depth) {
    // The seed is recorded as seen from the start
    const discovered = new Set()
    discovered.add(url)

    /**
     * @desc Number of URLs originating from the starting seed that are still left to crawl
     * @type {number}
     */
    this.urlCount = 1

    /**
     * @desc The starting seed's URL
     * @type {string}
     */
    this.url = url

    /**
     * @desc Symbol identifying the crawl mode the seed is operating under
     * @type {Symbol}
     */
    this.mode = mode

    /**
     * @desc URLs already recorded for this seed, used for de-duplication of the URLs
     * generated by the seed during the crawl
     * @type {Set<string>}
     */
    this.seen = discovered

    /**
     * @desc How deep the crawl goes for this seed
     * @type {number}
     */
    this.depth = depth
  }

  /**
   * @desc Reports whether every URL associated with this seed has been crawled,
   * i.e. the remaining URL count is exactly zero
   * @returns {boolean}
   */
  done () {
    const remaining = this.urlCount
    return remaining === 0
  }

  /**
   * @desc Records that one URL belonging to this seed has been crawled by
   * lowering the remaining URL count by one
   */
  crawledURL () {
    this.urlCount = this.urlCount - 1
  }

  /**
   * @desc Checks whether the supplied URL has already been recorded for this seed
   * @param {string} url - The URL to look up
   * @returns {boolean}
   */
  seenURL (url) {
    const { seen } = this
    return seen.has(url)
  }

  /**
   * @desc Marks a URL as seen and raises the seed's remaining URL count by one.
   * The count is raised on every call, whether or not the URL was already recorded.
   * @param {string} url - The URL to mark as seen
   */
  addToSeen (url) {
    const { seen } = this
    seen.add(url)
    this.urlCount = this.urlCount + 1
  }
}
/**
 * @desc The SeedTracker class, exposed as this module's CommonJS export
 * @type {SeedTracker}
 */
module.exports = SeedTracker