Reference Source

lib/frontier/helper.js

/*
 Squidwarc  Copyright (C) 2017-present  John Berlin <n0tan3rd@gmail.com>

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 Squidwarc is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this Squidwarc.  If not, see <http://www.gnu.org/licenses/>
 */
const Path = require('path')
const parseDomain = require('parse-domain')
const bigExtLookup = require('../utils/bigExtLookup')
const { cmodePO, cmodePAL, cmodePSD } = require('./modes')

/**
 * @desc Helper class providing static utility functions for the in memory frontier implementation {@link Frontier}
 */
class FrontierHelper {
  /**
   * @desc Ensure the starting seed list is one the frontier can understand.
   * When given an array, entries that are neither a string nor a non-null object
   * cannot be turned into a {@link Seed} and are left out of the returned array.
   * @param {Array<{url:string,mode:string,depth:number}|string>|{url:string,mode:string,depth:number}|string} seeds - The initial seeds for the crawl
   * @param {string} mode - The crawl mode for the crawl to be launched, used when a seed does not supply its own
   * @param {number} depth - The crawls depth, used when a seed does not supply its own
   * @returns {Seed | Seed[] | undefined} - The normalized {@link Seed}(s), undefined when a single seed of an unsupported type was supplied
   */
  static normalizeSeeds (seeds, mode, depth = 1) {
    if (!Array.isArray(seeds)) {
      return FrontierHelper.normalizeSeed(seeds, mode, depth)
    }
    const normalized = []
    for (const aSeed of seeds) {
      const seed = FrontierHelper.normalizeSeed(aSeed, mode, depth)
      // an unsupported entry would otherwise leave an undefined hole in the seed list
      if (seed !== undefined) {
        normalized.push(seed)
      }
    }
    return normalized
  }

  /**
   * @desc Normalize a single seed supplied either as a URL string or as a seed object
   * @param {{url:string,mode:string,depth:number}|string} seed - The seed to be normalized
   * @param {string} mode - The crawl mode used when the seed does not supply its own
   * @param {number} depth - The crawl depth used when the seed does not supply its own
   * @returns {Seed | undefined} - The normalized {@link Seed} or undefined if seed is neither a string nor a non-null object
   */
  static normalizeSeed (seed, mode, depth = 1) {
    if (typeof seed === 'string') {
      return {
        url: seed,
        mode: FrontierHelper.crawlModeToSymbol(mode),
        depth
      }
    }
    // typeof null is 'object', so null has to be excluded explicitly
    if (seed !== null && typeof seed === 'object') {
      return {
        url: seed.url,
        mode: FrontierHelper.crawlModeToSymbol(seed.mode || mode),
        depth: seed.depth || depth
      }
    }
    return undefined
  }

  /**
   * @desc Retrieve the crawl-mode symbol from a configs string.
   * Both the long and the abbreviated names are accepted, any other value
   * (including an empty or missing mode) yields the page-only symbol.
   * @param {string} mode - The crawl mode
   * @returns {Symbol} - The crawl modes internal symbol
   */
  static crawlModeToSymbol (mode) {
    switch (mode) {
      case 'page-same-domain':
      case 'psd':
        return cmodePSD
      case 'page-all-links':
      case 'pal':
        return cmodePAL
      case 'page-only':
      case 'po':
      default:
        return cmodePO
    }
  }

  /**
   * @desc Determine if a URL should be added to the frontier
   * @param {Object} url - A URL extracted from the currently visited page
   * @param {string} curURL - The URL of the currently visited page
   * @param {SeedTracker} tracker - The seed tracker associated with the very first page from which the chain of pages being visited originated
   * @returns {boolean}
   */
  static shouldAddToFrontier (url, curURL, tracker) {
    if (tracker.mode === cmodePSD) {
      return FrontierHelper.shouldAddToFrontierPSD(url, curURL, tracker)
    }
    return FrontierHelper.shouldAddToFrontierDefault(url, curURL, tracker)
  }

  /**
   * @desc Should a discovered URL be added to the frontier using the Page Same Domain strategy.
   * The URL is rejected when its extension is listed in bigExtLookup, when either
   * the current or the discovered host can not be parsed into a domain, when the
   * two domains (name and tld) differ or when the tracker has already seen it.
   * @param {Object} url - A URL extracted from the currently visited page, its pathname, host and href properties are read
   * @param {string} curURL - The URL of the currently visited page
   * @param {SeedTracker} tracker - The seed tracker associated with the very first page from which the chain of pages being visited originated
   * @returns {boolean}
   */
  static shouldAddToFrontierPSD (url, curURL, tracker) {
    const ext = Path.extname(url.pathname)
    if (ext !== '' && bigExtLookup[ext]) {
      return false
    }
    const currentDomain = parseDomain(curURL)
    const discoveredDomain = parseDomain(url.host)
    // either lookup may come back empty, without a domain there is nothing to compare
    if (!currentDomain || !discoveredDomain) {
      return false
    }
    // the domain name alone is not enough: example.com and example.org share the name "example"
    const sameDomain =
      currentDomain.domain === discoveredDomain.domain &&
      currentDomain.tld === discoveredDomain.tld
    if (!sameDomain) {
      return false
    }
    // the tracker is consulted last, only for URLs that passed every other check
    // NOTE(review): seenURL may also record the URL - confirm against SeedTracker
    return !tracker.seenURL(url.href)
  }

  /**
   * @desc Should a discovered URL be added to the frontier using the default strategy, applies for page-only and page-all-links.
   * The URL is rejected when its extension is listed in bigExtLookup or when the tracker has already seen it.
   * @param {Object} url - A URL extracted from the currently visited page, its pathname and href properties are read
   * @param {string} curURL - The URL of the currently visited page (not used by this strategy)
   * @param {SeedTracker} tracker - The seed tracker associated with the very first page from which the chain of pages being visited originated
   * @returns {boolean}
   */
  static shouldAddToFrontierDefault (url, curURL, tracker) {
    const ext = Path.extname(url.pathname)
    if (ext !== '' && bigExtLookup[ext]) {
      return false
    }
    return !tracker.seenURL(url.href)
  }
}

/**
 * The FrontierHelper class itself is the module's export; its methods are static
 * and are called directly on the exported class.
 * @type {FrontierHelper}
 */
module.exports = FrontierHelper

/**
 * @typedef {Object} Seed
 * @property {string} url   - The URL of the seed to be crawled
 * @property {Symbol} mode  - The mode in which the seed, and the URLs discovered by crawling the seed, should operate
 * @property {number} depth - The depth of the crawl
 */