Reference Source

lib/frontier/seedTracker.js

/*
 Squidwarc  Copyright (C) 2017-present  John Berlin <n0tan3rd@gmail.com>

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 Squidwarc is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this Squidwarc.  If not, see <http://www.gnu.org/licenses/>
 */

/**
 * @desc Per-seed bookkeeping for a crawl. A crawl may start from several seed URLs and each
 * seed may lead to further URLs; one SeedTracker instance holds, for a single starting seed,
 * the count of URLs still outstanding, the set of URLs already seen, and the crawl mode and
 * depth that apply to that seed.
 */
class SeedTracker {
  /**
   * @param {string} url - The starting seed URL
   * @param {Symbol} mode - The crawl mode the seed operates under
   * @param {number} depth - The crawl depth for the seed
   */
  constructor (url, mode, depth) {
    /**
     * @desc Number of URLs originating from this seed that still have to be crawled.
     * Starts at one because the seed itself is pending.
     * @type {number}
     */
    this.urlCount = 1

    /**
     * @desc The starting seed URL
     * @type {string}
     */
    this.url = url

    /**
     * @desc The crawl mode symbol associated with this seed
     * @type {Symbol}
     */
    this.mode = mode

    /**
     * @desc URLs already seen for this seed, used to detect duplicates. Initially holds only the seed URL.
     * @type {Set<string>}
     */
    this.seen = new Set().add(url)

    /**
     * @desc The crawl depth for this seed
     * @type {number}
     */
    this.depth = depth
  }

  /**
   * @desc Reports whether every URL associated with this seed has been crawled
   * @returns {boolean} true only when the outstanding URL count is exactly zero
   */
  done () {
    const remaining = this.urlCount
    return remaining === 0
  }

  /**
   * @desc Records that one URL belonging to this seed was crawled by lowering the outstanding count by one.
   * The count is not clamped, so it can become negative if called more often than URLs were added.
   */
  crawledURL () {
    this.urlCount = this.urlCount - 1
  }

  /**
   * @desc Checks whether the supplied URL has already been seen for this seed
   * @param {string} url - The URL to look up
   * @returns {boolean} true if the URL is in the seen set
   */
  seenURL (url) {
    const alreadySeen = this.seen.has(url)
    return alreadySeen
  }

  /**
   * @desc Marks a URL as seen and raises the outstanding URL count by one.
   * The count is raised on every call, including when the URL was already in the seen set.
   * @param {string} url - The URL to mark as seen
   */
  addToSeen (url) {
    this.urlCount = this.urlCount + 1
    this.seen.add(url)
  }
}

/**
 * @desc Exposes the SeedTracker class as this module's CommonJS export
 * @type {SeedTracker}
 */
module.exports = SeedTracker