// Reference Source
//
// lib/utils/warcNaming.js

/*
 Squidwarc  Copyright (C) 2017 - present  John Berlin <n0tan3rd@gmail.com>

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 Squidwarc is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with Squidwarc.  If not, see <http://www.gnu.org/licenses/>.
 */

const fileNamify = require('filenamify-url')
const moment = require('moment')
const path = require('path')
const cp = require('./colorPrinters')

/**
 * @desc Collection of static factories for the functions that decide where a WARC is written.
 * Each factory returns a function which, when invoked, yields the full path of the WARC file
 */
class WARCNaming {
  /**
   * @desc Creates a namer that produces a fresh WARC path on every invocation: the output
   * directory joined with the filenamified seed URL, a moment timestamp (format `MM-DD-YYYY_x`)
   * taken at call time and the `.warc` extension
   * @param {string} outPath the full path to the WARC file output directory
   * @return {function(seedURL: string): string}
   */
  static warcNamePerURL (outPath) {
    return seedURL => {
      const safeName = fileNamify(seedURL)
      const stamp = moment().format('MM-DD-YYYY_x')
      return path.join(outPath, `${safeName}-${stamp}.warc`)
    }
  }

  /**
   * @desc Creates a namer that always answers with the same, caller chosen, WARC path.
   * The path is joined once, when this factory is called, not on each invocation of the namer
   * @param {string} outPath  the full path to the WARC file output directory
   * @param {string} warcName the name of the WARC file to create
   * @return {function(): string}
   */
  static suppliedWarcName (outPath, warcName) {
    const fixedPath = path.join(outPath, warcName)
    return function () {
      return fixedPath
    }
  }

  /**
   * @desc Creates a namer that derives the WARC path from the first seed URL it receives
   * and then keeps returning that same path; the URLs of later invocations are ignored
   * @param {string} outPath the full path to the WARC file output directory
   * @return {function(seedURL: string): string}
   */
  static apndWarcNamePerURL (outPath) {
    // null until the first invocation has computed the path
    let cachedPath = null
    return seedURL => {
      if (cachedPath !== null) return cachedPath
      const safeName = fileNamify(seedURL)
      const stamp = moment().format('MM-DD-YYYY_x')
      cachedPath = path.join(outPath, `${safeName}-${stamp}.warc`)
      return cachedPath
    }
  }

  /**
   * @desc Selects the WARC naming function matching the crawl configuration and reports
   * the choice through the color printer.
   *
   * When `warc.naming` is (case-insensitively) `url` the name is derived from the seed URL,
   * either once (`warc.append` truthy) or per URL. Any other value is used verbatim as the
   * WARC file name, and in that case `warc.append` is set to `true` on the supplied config
   * @param {CrawlConfig} options the crawl config, only its `warc` property is read
   * @return {function(seedURL: string): string} the configured naming function
   */
  static getWarcNamingFunction ({ warc }) {
    const namedByURL = warc.naming.toLowerCase() === 'url'
    if (!namedByURL) {
      // a single, explicitly named WARC: the config is switched to append mode
      warc.append = true
      cp.crawlerOpt('Crawler Will Be Generating A WARC File Named', warc.naming)
      return WARCNaming.suppliedWarcName(warc.output, warc.naming)
    }
    cp.crawlerOpt('Crawler Will Be Generating WARC Files Using', 'the filenamified url')
    return warc.append
      ? WARCNaming.apndWarcNamePerURL(warc.output)
      : WARCNaming.warcNamePerURL(warc.output)
  }
}

// The WARCNaming class is the sole export of this CommonJS module
module.exports = WARCNaming