Reference Source

lib/frontier/index.js

/*
 Squidwarc  Copyright (C) 2017-present  John Berlin <n0tan3rd@gmail.com>

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 Squidwarc is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this Squidwarc.  If not, see <http://www.gnu.org/licenses/>
 */
const FH = require('./helper')
const SeedTracker = require('./seedTracker')
const { cmodePO } = require('./modes')

/**
 * @desc In-memory implementation of a crawl frontier: a FIFO queue of URLs
 * waiting to be crawled plus one {@link SeedTracker} per starting seed,
 * keyed by the seed's URL.
 */
class Frontier {
  /**
   * @desc Create a new, empty frontier
   */
  constructor () {
    /**
     * @desc URLs to be crawled, in the order they will be handed out by next()
     * @type {Array<{url: string, mode: Symbol, cdepth: number, tracker: string}>}
     */
    this.queue = []

    /**
     * @desc Tracks the depth and crawl config per starting seed, keyed by seed URL
     * @type {Map<string, SeedTracker>}
     */
    this.trackers = new Map()

    /**
     * @desc Information pertaining to the current URL being crawled,
     * null until next() has returned a URL and again once the queue runs dry
     * @type {?{url: string, mode: Symbol, cdepth: number, tracker: string}}
     */
    this.current = null
  }

  /**
   * @desc Initialize the frontier with one or more starting seeds. Each seed
   * gets its own tracker (depth defaults to 1 when the seed's depth is falsy)
   * and is queued at crawl depth 0. A seed whose URL already has an active
   * tracker is skipped: trackers are keyed by URL, so re-adding it would
   * replace the existing tracker while queueing the same URL a second time.
   * A null or undefined argument is ignored.
   * @param {Seed[] | Seed} starting
   */
  init (starting) {
    if (starting == null) {
      return
    }
    // Treat the single-seed form as a one element list so both forms share one code path
    const seeds = Array.isArray(starting) ? starting : [starting]
    for (let i = 0; i < seeds.length; i++) {
      const seed = seeds[i]
      if (this.trackers.has(seed.url)) {
        // First occurrence wins; its tracker and queue entry are already in place
        continue
      }
      this.trackers.set(seed.url, new SeedTracker(seed.url, seed.mode, seed.depth || 1))
      this.queue.push({
        url: seed.url,
        cdepth: 0,
        mode: seed.mode,
        tracker: seed.url
      })
    }
  }

  /**
   * @desc Returns the number of URLs left in the queue
   * @return {number}
   */
  size () {
    return this.queue.length
  }

  /**
   * @desc Is the frontier exhausted (no URLs left in the queue)
   * @return {boolean}
   */
  exhausted () {
    return this.queue.length === 0
  }

  /**
   * @desc Remove the URL at the front of the queue, remember its entry as
   * the current one and return the URL. When the queue is empty the current
   * entry is reset to null and undefined is returned.
   * @return {string|undefined}
   */
  next () {
    const entry = this.queue.shift()
    if (entry === undefined) {
      // shift() yields undefined on an empty queue; keep current at its documented null
      this.current = null
      return undefined
    }
    this.current = entry
    return entry.url
  }

  /**
   * @desc Process discovered outlinks of the current page based on the
   * originating seed's configuration. The seed's tracker is told that the
   * current URL was crawled; unless the current entry is in page-only mode,
   * every link accepted by FH.shouldAddToFrontier is marked as seen and
   * queued one level deeper. Links queued at or beyond the tracker's depth
   * are queued in page-only mode. Once the tracker reports done it is removed.
   * Does nothing when there is no current entry or its tracker no longer exists.
   * @param {Array<{href: string, pathname: string, host: string}>} links outlinks to consider
   */
  process (links) {
    const current = this.current
    if (!current) {
      return
    }
    const tracker = this.trackers.get(current.tracker)
    if (!tracker) {
      return
    }
    tracker.crawledURL()
    if (current.mode !== cmodePO) {
      const nextDepth = current.cdepth + 1
      const nextMode = nextDepth < tracker.depth ? current.mode : cmodePO
      // A missing link list is treated the same as a page without outlinks
      const count = links == null ? 0 : links.length
      // Walk from the last link to the first, matching the established queueing order
      for (let i = count - 1; i >= 0; i--) {
        const link = links[i]
        if (FH.shouldAddToFrontier(link, current.url, tracker)) {
          tracker.addToSeen(link.href)
          this.queue.push({
            url: link.href,
            cdepth: nextDepth,
            mode: nextMode,
            tracker: current.tracker
          })
        }
      }
    }
    if (tracker.done()) {
      this.trackers.delete(current.tracker)
    }
  }
}

/**
 * @desc The in-memory Frontier class, exposed as this module's only export
 * @type {Frontier}
 */
module.exports = Frontier