lib/injectManager/pageInjects/collectLinks.js
/*
Squidwarc Copyright (C) 2017-present John Berlin <n0tan3rd@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Squidwarc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this Squidwarc. If not, see <http://www.gnu.org/licenses/>
*/
/**
* @desc Function that is injected into every frame of the page currently being crawled that will
* setup the outlink collection depending if the frame injected into is the top frame or a sub frame.
*
* If this function is injected into the top frame an instance of Collector / TopHandler are created otherwise
* only an instance of Collector is created.
*
* In the case of injection into the top frame the `$$$$Squidwarc$$Collector$$$$` property will be defined on
* window with value of the created TopHandler instance and `message` event listener will be registered on window for
* receiving messages sent by this script when injected into child frames.
*
* Each child frame will send two messages (`indicateIsChild`, `outlinkgot`) and listen for one (`outlinkcollect`).
* The message types are found in the object m within the body of this function.
* The `indicateIsChild` message is sent immediately by each child frame so that TopHandler can hold onto a reference to the frame for communicating with it.
* The `outlinkgot` message is sent by each child frame to the top frame once outlinks have been collected for that frame.
* The `outlinkcollect` message is sent by TopHandler to each child frame to have it start collecting outlinks.
* @return {void}
*/
exports.initCollectLinks = function initCollectLinks () {
let isIframe
try {
isIframe = window.self !== window.top
} catch (e) {
isIframe = true
}
/**
* @desc Performs the outlink collection for a frame
*/
class Collector {
constructor () {
this.ignore = [
'#',
'about:',
'data:',
'mailto:',
'javascript:',
'js:',
'{',
'*',
'ftp:',
'tel:'
]
this.good = {
'http:': true,
'https:': true
}
this.ilen = this.ignore.length
this.outlinks = []
this.links = []
this.linksSeen = new Set()
this.urlParer = new window.URL('about:blank')
this.urlParts = /^(https?:\/\/)?([^/]*@)?(.+?)(:\d{2,5})?([/?].*)?$/
this.dot = /\./g
}
static extractLinks () {
const collector = new Collector()
return collector.getOutlinks()
}
/**
* @desc Determines if the supplied URL is to be ignored or not
* @param {string} test - A URL
* @return {boolean}
*/
shouldIgnore (test) {
let ignored = false
for (let i = 0; i < this.ilen; ++i) {
if (test.startsWith(this.ignore[i])) {
ignored = true
break
}
}
if (!ignored) {
let parsed = true
try {
this.urlParer.href = test
} catch (error) {
parsed = false
}
return !(parsed && this.good[this.urlParer.protocol])
}
return ignored
}
/**
* @desc Collects the outlink information for a frame
* @return {{outlinks: string, links: Array<string>, location: string}}
*/
getOutlinks () {
const found = document.querySelectorAll(
'a[href],link[href],img[src],script[src],area[href]'
)
let flen = found.length
let elem
for (let i = 0; i < flen; ++i) {
elem = found[i]
switch (elem.nodeName) {
case 'LINK':
if (elem.href !== '') {
this.outlinks.push(`${elem.href} E link/@href\r\n`)
}
break
case 'IMG':
if (elem.src !== '') {
this.outlinks.push(`${elem.src} E =EMBED_MISC\r\n`)
}
break
case 'SCRIPT':
if (elem.src !== '') {
this.outlinks.push(`${elem.src} E script/@src\r\n`)
}
break
default:
let href = elem.href.trim()
if (href !== '' && href !== ' ') {
if (!this.shouldIgnore(href) && !this.linksSeen.has(href)) {
this.linksSeen.add(href)
this.links.push({
href,
pathname: this.urlParer.pathname,
host: this.urlParer.host
})
}
this.outlinks.push(`outlink: ${href} L a/@href\r\n`)
}
break
}
}
let location
try {
location = window.location.href
} catch (error) {}
return {
outlinks: this.outlinks.join(''),
links: this.links,
location
}
}
}
class TopHandler {
constructor (collectorRef, messages) {
/**
* @type {{outlinks: string, links: Array<string>, totalChildren: number}}
*/
this.found = {
outlinks: '',
links: [],
totalChildren: 0
}
this.collectorRef = collectorRef
this.messages = messages
this.done = null
this.childSources = []
this.childFrames = 0
this.countingChildren = true
this.to = null
this.toStop = false
this.go = this.go.bind(this)
this.helloFromFrame = this.helloFromFrame.bind(this)
this.finished = this.finished.bind(this)
}
/**
* @desc Returns a promise that resolves once outlink collection, from top frame and child frames is complete
* @return {Promise<{outlinks: string, links: Array<string>, totalChildren: number}>}
*/
prWhenDone () {
return new Promise(resolve => {
this.done = resolve
})
}
/**
* @desc Send the `outlinkcollect` message to all child frames and start the collection timeout
*/
go () {
this.countingChildren = false
this.found.totalChildren = this.childFrames
const cs = this.childSources
for (let i = 0; i < cs.length; ++i) {
let c = cs[i]
if (c && c.postMessage) {
c.postMessage({ type: this.messages.outlinkcollect }, '*')
}
}
this.to = setTimeout(this.finished, 20000)
}
/**
* @desc Listens for the `outlinkgot` message sent by each child frame that contains its outlink information
*/
helloFromFrame (e) {
if (e.data) {
if (
e.data.type === this.messages.indicateIsChild &&
e.origin &&
e.origin !== 'null' &&
this.countingChildren
) {
this.childFrames += 1
this.childSources.push(e.source)
} else if (e.data.type === this.messages.outlinkgot) {
this.found.outlinks += e.data.outlinks.outlinks
this.found.links = this.found.links.concat(e.data.outlinks.links)
this.childFrames -= 1
if (this.childFrames === 0 && !this.toStop) {
this.finished()
}
}
}
}
/**
* @desc Called once child frame outlink collection is complete. Collects the top frames outlinks and
* resolves the Promise that is being awaited by the crawler with the values of all outlinks collected
*/
finished () {
if (this.to) {
clearTimeout(this.to)
}
this.to = null
this.toStop = true
const { links, outlinks, location } = this.collectorRef.extractLinks()
this.found.outlinks += outlinks
this.found.location = location
this.found.links = this.found.links.concat(links)
this.done(this.found)
}
}
/**
* @type {{indicateIsChild: string, outlinkcollect: string, outlinkgot: string}}
*/
const m = {
indicateIsChild: '$$$$Squidwarc$$IsChild$$$$',
outlinkcollect: '$$$$Squidwarc$$CollectOutLinks$$$$',
outlinkgot: '$$$$Squidwarc$$GotOutlinks$$$$'
}
if (!isIframe) {
Object.defineProperty(window, '$$$$Squidwarc$$Collector$$$$', {
enumerable: false,
configurable: false,
value: new TopHandler(Collector, m)
})
window.addEventListener(
'message',
window.$$$$Squidwarc$$Collector$$$$.helloFromFrame,
false
)
} else {
const mhc = function messageHandlerChild (e) {
if (e.data && e.data.type === m.outlinkcollect) {
let outlinks
try {
outlinks = Collector.extractLinks()
} catch (e) {
outlinks = {
error: e.toString(),
outlinks: '',
links: [],
location: window.location.href
}
}
window.top.postMessage({ type: m.outlinkgot, outlinks }, '*')
}
}
window.addEventListener('message', mhc, false)
window.top.postMessage({ type: m.indicateIsChild }, '*')
}
}
/**
* @desc Starts the collection of the outlinks. Use only when {@link initCollectLinks} is pre-injected into every frame
* @return {Promise<{outlinks: string, links: Array<{href: string, pathname: string, host: string}>, totalChildren: number, location: string}>}
*/
exports.collect = function collect () {
const prom = window.$$$$Squidwarc$$Collector$$$$.prWhenDone()
// defer execution of go
Promise.resolve().then(() => window.$$$$Squidwarc$$Collector$$$$.go())
return prom
}
/**
* @desc Builds the WARC outlink metadata information for the current page and finds the links the crawler could potentially go to next
* @return {Promise<{outlinks: string, links: Array<{href: string, pathname: string, host: string}>}>}
*/
exports.outLinks = async function outLinks () {
const ignore = [
'#',
'about:',
'data:',
'mailto:',
'javascript:',
'js:',
'{',
'*',
'ftp:',
'tel:'
]
const good = { 'http:': true, 'https:': true }
const outlinks = []
const links = []
const linksSeen = new Set()
const urlParer = new URL('about:blank')
function shouldIgnore (test) {
let ignored = false
let i = ignore.length
while (i--) {
if (test.startsWith(ignore[i])) {
ignored = true
break
}
}
if (!ignored) {
let parsed = true
try {
urlParer.href = test
} catch (error) {
parsed = false
}
return !(parsed && good[urlParer.protocol])
}
return ignored
}
const found = document.querySelectorAll(
'a[href],link[href],img[src],script[src],area[href]'
)
let elem
let i = found.length
while (i--) {
elem = found[i]
switch (elem.nodeName) {
case 'LINK':
if (elem.href !== '') {
outlinks.push(`${elem.href} E link/@href\r\n`)
}
break
case 'IMG':
if (elem.src !== '') {
outlinks.push(`${elem.src} E =EMBED_MISC\r\n`)
}
break
case 'SCRIPT':
if (elem.src !== '') {
outlinks.push(`${elem.src} E script/@src\r\n`)
}
break
default:
let href = elem.href.trim()
if (href !== '' && href !== ' ') {
if (!shouldIgnore(href) && !linksSeen.has(href)) {
linksSeen.add(href)
links.push({
href,
pathname: urlParer.pathname,
host: urlParer.host
})
}
outlinks.push(`outlink: ${href} L a/@href\r\n`)
}
break
}
}
return {
outlinks: outlinks.join(''),
links: links
}
}