diff --git a/README.md b/README.md index f434dcd..de0a075 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,31 @@ it is TypeScript implementation of [Obelisk](https://github.com/go-shiori/obelis ## Usage +### As CLI tool + +```sh +npm install -g @wabarc/cairn +``` + +```sh +$ cairn -h + +Usage: cairn [options] url1 [url2]...[urlN] + +CLI tool for saving web page as single HTML file + +Options: + -v, --version output the current version + -o, --output path to save archival result + -u, --user-agent set custom user agent + -t, --timeout maximum time (in second) request timeout + --no-js disable JavaScript + --no-css disable CSS styling + --no-embeds remove embedded elements (e.g iframe) + --no-medias remove media elements (e.g img, audio) + -h, --help display help for command +``` + ### As npm package ```sh @@ -25,42 +50,79 @@ npm install @wabarc/cairn ```javascript import { Cairn } from '@wabarc/cairn'; +// const cairn = require('@wabarc/cairn'); const cairn = new Cairn(); cairn .request({ url: url }) - .options({ userAgent: 'Cairn/1.0.0' }) + .options({ userAgent: 'Cairn/2.0.0' }) .archive() - .then((webpage) => { - console.log(url, webpage); + .then((archived) => { + console.log(archived.url, archived.webpage.html()); }) .catch((err) => console.warn(`${url} => ${JSON.stringify(err)}`)); ``` -### As CLI tool +#### Instance methods -```sh -npm install -g @wabarc/cairn +##### cairn#request({ url: string }): this +##### cairn#options({}): this +- userAgent?: string; +- disableJS?: boolean; +- disableCSS?: boolean; +- disableEmbeds?: boolean; +- disableMedias?: boolean; +- timeout?: number; + +##### cairn#archive(): Promise +##### cairn#Archived +- url: string; +- webpage: cheerio.Root; +- status: 200 | 400 | 401 | 403 | 404 | 500 | 502 | 503 | 504; +- contentType: 'text/html' | 'text/plain' | 'text/*'; + +#### Request Params + +##### request + +```javascript +{ + // `url` is archival target. + url: 'https://www.github.com' +} ``` -```sh -$ cairn -h +##### options -Usage: cairn [options] url1 [url2]...[urlN] +```javascript +{ + userAgent: 'Cairn/2.0.0', -CLI tool for saving web page as single HTML file + disableJS: true, + disableCSS: false, + disableEmbeds: false, + disableMedias: true, -Options: - -v, --version output the current version - -o, --output path to save archival result - -u, --user-agent set custom user agent - -t, --timeout maximum time (in second) request timeout - --no-js disable JavaScript - --no-css disable CSS styling - --no-embeds remove embedded elements (e.g iframe) - --no-medias remove media elements (e.g img, audio) - -h, --help display help for command + timeout: 30 +} +``` + +#### Response Schema + +for v1.x: + +The `archive` method will return webpage body as string. + +for v2.x: + +```javascript +{ + url: 'https://github.com/', + webpage: cheerio.Root, + status: 200, + contentType: 'text/html' +} ``` ## License diff --git a/package.json b/package.json index d730dd3..3d1db2f 100644 --- a/package.json +++ b/package.json @@ -37,11 +37,12 @@ }, "homepage": "https://github.com/wabarc/cairn#readme", "dependencies": { - "axios": "^0.20.0", - "commander": "^6.1.0", - "jsdom": "^16.4.0" + "axios": "^0.21.0", + "cheerio": "^1.0.0-rc.3", + "commander": "^6.1.0" }, "devDependencies": { + "@types/cheerio": "^0.22.22", "@types/jest": "^26.0.15", "@types/node": "^14.14.2", "@typescript-eslint/eslint-plugin": "^4.5.0", @@ -50,6 +51,7 @@ "eslint-plugin-jest": "^24.1.0", "eslint-plugin-prettier": "^3.1.4", "jest": "^26.6.1", + "jsdom": "^16.4.0", "nodemon": "^2.0.6", "prettier": "^2.1.2", "ts-jest": "^26.4.2", diff --git a/src/archiver.ts b/src/archiver.ts index 1f859cb..3bc0c64 100644 --- a/src/archiver.ts +++ b/src/archiver.ts @@ -1,5 +1,5 @@ -import { Archiver as ArchiverImpl, Options, Requests, Webpage } from './types/cairn'; -import { Err, HTTP, isValidURL } from './utils'; +import { Archiver as ArchiverImpl, Options, Requests, Archived } from './types/cairn'; +import { err, http, isValidURL } from './utils'; import { HTML } from './html'; export class Archiver implements ArchiverImpl { @@ -23,7 +23,7 @@ export class Archiver implements ArchiverImpl { request(r: Requests): this { const { url } = r; if (!isValidURL(url)) { - Err('request url is not specified'); + err('request url is not specified'); } this.req.url = url; @@ -50,43 +50,27 @@ export class Archiver implements ArchiverImpl { * @return {Promise} with string * @api public */ - async archive(): Promise { - return await (async () => { - let webpage: Webpage; - let content = ''; - let process = false; + async archive(): Promise { + const archived: Archived = { url: this.req.url, webpage: null, status: 400, contentType: 'text/html' }; + const response = await this.download(this.req.url).catch((err) => err(err)); + if (response.isAxiosError === true || !response.headers) { + return archived; + } + + const contentType = response.headers['content-type'] || response.headers['Content-Type'] || ''; + // Check the type of the downloaded file. + // If it's not HTML, just return it as it is. + if (contentType.includes('text/html') === true) { + // If it's HTML process it + archived.webpage = await new HTML(this.opt).process({ uri: this.req.url, html: response.data }); + } + archived.status = response.status || archived.status; + archived.contentType = contentType; - return await this.download(this.req.url) - .then((response) => { - // Check the type of the downloaded file. - // If it's not HTML, just return it as it is. - if (response.isAxiosError === true) { - return content; - } - if (!response.headers) { - return content; - } - const contentType = response.headers['content-type'] || response.headers['Content-Type'] || ''; - process = contentType.includes('text/html'); - webpage = { uri: this.req.url, content: response.data, contentType: contentType }; - }) - .then(async () => { - if (process === true) { - // If it's HTML process it - content = await new HTML(this.opt).process(webpage); - } - return content; - }) - .catch((err) => { - console.warn(err); - return content; - }); - })(); + return archived; } async download(url: string, referer?: string): Promise { - const http = new HTTP(); - if (this.opt.userAgent) { http.setHeader('User-Agent', this.opt.userAgent); } @@ -95,6 +79,6 @@ export class Archiver implements ArchiverImpl { http.setOptions({ timeout: this.opt.timeout }); } - return await http.fetch(url).catch((err) => Err(err)); + return await http.setResponseType('text').fetch(url); } } diff --git a/src/cairn.ts b/src/cairn.ts index 3b3b85d..4b1810d 100644 --- a/src/cairn.ts +++ b/src/cairn.ts @@ -1,43 +1,16 @@ -import { Options, Requests } from './types/cairn'; import { Archiver } from './archiver'; +export { Archived } from './types'; process.on('uncaughtException', (e) => { console.error(e); }); -class Cairn { - private arc: Archiver; +class Cairn extends Archiver {} - /** - * Initialize a new `Cairn`. - * - * @api public - */ - constructor() { - this.arc = new Archiver(); - } - - request(r: Requests): this { - this.arc.request(r); - - return this; - } - - options(o: Options): this { - this.arc.options(o); - return this; - } - - archive(): Promise { - return this.arc.archive(); - } -} +const cairn = new Cairn(); -exports = module.exports = new Cairn(); +exports = module.exports = cairn; exports.cairn = exports; - exports.Cairn = Cairn; -const cairn = new Cairn(); - export { Cairn, cairn }; diff --git a/src/cli.ts b/src/cli.ts index 4362b5a..8fb488f 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -1,8 +1,7 @@ #!/usr/bin/env node - import { Options } from './types/cairn'; import { Command } from 'commander'; -import { Cairn } from './cairn'; +import { Archiver } from './archiver'; import { isValidURL, createFileName } from './utils'; import { statSync, writeFile } from 'fs'; @@ -15,7 +14,7 @@ class Handler { this.opt = {}; } - main() { + async main() { const program = this.parser(); if (this.url.length < 1) { @@ -32,7 +31,7 @@ class Handler { filepath = program.output + '/'; } - const output = (url: string, filename: string, content: string) => { + const output = async (url: string, filename: string, content: string) => { if (program.output === '-') { console.info(content); } else { @@ -46,7 +45,7 @@ class Handler { } }; - const cairn = new Cairn(); + const cairn = new Archiver(); for (const url of this.url) { if (!isValidURL(url)) { console.info(`${url} => request url is not specified\n`); @@ -54,12 +53,21 @@ class Handler { } const filename = filepath + createFileName(url); - cairn + await cairn .request({ url: url }) .options(this.opt) .archive() - .then((webpage) => { - output(url, filename, webpage); + .then(async (archived) => { + if (!archived.webpage || typeof archived.webpage.root !== 'function') { + return; + } + + const html = archived.webpage.root() ? archived.webpage.root().html() : ''; + if (!html) { + console.warn(`${url} => archival failure. [status: ${archived.status}]`); + return; + } + await output(url, filename, html || ''); }) .catch((err) => console.warn(`${url} => ${JSON.stringify(err)}`)); } diff --git a/src/css.ts b/src/css.ts index 9ac11ba..4b2cf85 100644 --- a/src/css.ts +++ b/src/css.ts @@ -1,48 +1,41 @@ import { createAbsoluteURL, convertToData } from './utils'; -export class CSS { - async process(node: HTMLElement | HTMLStyleElement, uri = ''): Promise { - if (!node || typeof node !== 'object') { - return; +class CSS { + async process(style: string, baseURL: string): Promise { + if (style === undefined || typeof style !== 'string') { + return ''; } - const convert = async (u: string, n: string) => { - const assetURL = createAbsoluteURL(u, n); + const convert = async (url: string, baseURL: string): Promise => { + const assetURL = createAbsoluteURL(url, baseURL); const data = await convertToData(assetURL); - if (data.length > 0) { - const regex = u.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&'); - node.outerHTML = node.outerHTML.replace(new RegExp(regex, 'g'), data); - } - }; - const transform = async (text: string, regexp: RegExp) => { - const matches = [...new Set(text.matchAll(regexp))]; - for (const m of matches) { - if (m !== null && m.length > 0) { - await convert(m[0], uri); - } - } - return; + return data || ''; }; - const inlineStyle = node.getAttribute('style'); - if (inlineStyle && typeof inlineStyle === 'string') { - const regex = /(?<=url\((?!['"]?(?:data:)))\s*(['"]?)(.*)\1\)/gm; - await transform(inlineStyle, regex); - return; - } + style = style.replace(/\(['|"]/gm, '(').replace(/['|"]\)/gm, ')'); + const regexp = /(?<=url\().*?(?=\))/gm; + const matches = style.matchAll(regexp); - let block = node.innerHTML; - if (!block || typeof block !== 'string') { - return; + const rules = new Map(); + for (const m of matches) { + if (m !== null && m.length > 0) { + const resourceURL = m[0]; + const data = await convert(resourceURL, baseURL); + rules.set(resourceURL, Buffer.from(data).toString()); + } } - block = block.replace(/\(['|"]/gm, '('); - block = block.replace(/['|"]\)/gm, ')'); - - const regex = /(?<=url\().*?(?=\))/gm; - await transform(block, regex); + for (const [url, data] of rules) { + if (data.length > 0) { + const regex = url.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&'); + style = style.replace(new RegExp(regex, 'g'), data); + } + } - return; + return style; } } + +const css = new CSS(); +export { css }; diff --git a/src/html.ts b/src/html.ts index e144d39..a3b5dbd 100644 --- a/src/html.ts +++ b/src/html.ts @@ -1,8 +1,8 @@ -import { Options, Webpage } from './types'; -import { removeChild, isValidURL, createAbsoluteURL, convertToData } from './utils'; -import { CSS } from './css'; -import { URI } from './uri'; -import { JSDOM, VirtualConsole } from 'jsdom'; +import cheerio from 'cheerio'; +import { Options } from './types'; +import { css } from './css'; +import { uri } from './uri'; +import { err, isValidURL, createAbsoluteURL, convertToData } from './utils'; /** * @see https://html.spec.whatwg.org/multipage/semantics.html @@ -24,15 +24,16 @@ export class HTML { /** * Process assets within webpage * - * @param {Webpage} [Webpage] if error will be thrown - * @return {this} [Cairn] `this` command for chaning + * @param {Object} page if error will be thrown + * @return {cheerio.Cheerio} [cheerio.Cheerio] call .html() parse as html string * @api public */ - async process(page: Webpage): Promise { - const { content, uri } = page; - const virtualConsole = new VirtualConsole().on('jsdomError', (e) => console.log('JSDOM', e)); - const dom = new JSDOM(content, { virtualConsole }); - const doc = dom.window.document; + async process(page: { uri: string; html: string }): Promise { + const { html, uri } = page; + if (typeof html !== 'string' || typeof uri !== 'string') { + err('Cannot process webpage.'); + } + const $ = cheerio.load(html); // Prepare documents by doing these steps : // - Set Content-Security-Policy to make sure no unwanted request happened @@ -43,22 +44,20 @@ export class HTML { // - Convert relative URL into absolute URL // - Remove subresources integrity attribute from links // - Convert Open Graph Metadata - this.setContentSecurityPolicy(doc); - this.applyConfiguration(doc); - this.convertNoScriptToDiv(doc, true); - this.removeComments(doc); - this.convertLazyImageAttrs(doc); - this.convertRelativeURLs(doc, uri); - this.removeLinkIntegrityAttr(doc); - this.convertOpenGraph(doc); + this.setContentSecurityPolicy($); + this.applyConfiguration($); + this.convertNoScriptToDiv($, true); + this.removeComments($); + this.convertLazyImageAttrs($); + this.convertRelativeURLs($, uri); + this.removeLinkIntegrityAttr($); + this.convertOpenGraph($); // Find all nodes which might has subresource. // A node might has subresource if it fulfills one of these criteria : // - It has inline style; // - It's link for icon or stylesheets; // - It's tag name is either style, img, picture, figure, video, audio, source, iframe or object; - let tagName; - const nodes: HTMLElement[] = []; const tags = 'link,style,script,iframe,embed,object,img,picture,figure,video,audio,source'; const rels = [ 'icon', @@ -68,61 +67,64 @@ export class HTML { 'apple-touch-icon-precomposed', 'apple-touch-icon', ]; - doc.querySelectorAll(tags).forEach(function (currentNode) { - tagName = currentNode.tagName; - if (typeof tagName !== 'string') { - return; - } - switch (tagName.toLowerCase()) { - case 'link': { - const rel = currentNode.getAttribute('rel'); - if (rels.includes(rel)) { - nodes.push(currentNode); - } - break; + const nodes: cheerio.Element[] = []; + $.root() + .find(tags) + .each((_, e) => { + const tagName = e.name; + if (typeof tagName !== 'string') { + return; } - case 'style': - case 'script': - case 'iframe': - case 'embed': - case 'object': - case 'img': - case 'picture': - case 'figure': - case 'video': - case 'audio': - case 'source': { - nodes.push(currentNode); - break; + + const $elem = $(e); + switch (tagName.toLowerCase()) { + case 'link': { + const rel = $elem.attr('rel'); + if (typeof rel === 'string' && rels.includes(rel)) { + nodes.push(e); + } + break; + } + case 'style': + case 'script': + case 'iframe': + case 'embed': + case 'object': + case 'img': + case 'picture': + case 'figure': + case 'video': + case 'audio': + case 'source': { + nodes.push(e); + break; + } } - } - }); + }); - const css = new CSS(); for (const node of nodes) { - tagName = node.tagName; - if (node.hasAttributes() && node.getAttribute('style')) { - await css.process(node, uri); + const tagName = node.name; + if (Object.entries(node.attribs).length > 0 && $(node).attr('style')) { + await this.processStyleAttr($(node), uri); } - tagName = tagName.toLowerCase(); - switch (tagName) { + switch (tagName.toLowerCase()) { case 'style': { - await css.process(node, uri); + await this.processStyleNode($(node), uri); break; } case 'link': { - await this.processLinkNode(node, uri); + await this.processLinkNode($(node), uri); break; } case 'script': { - await this.processScriptNode(node, uri); + await this.processScriptNode($(node), uri); break; } case 'iframe': case 'embed': case 'object': { - await this.processEmbedNode(node, uri); + await this.processEmbedNode($(node), uri); break; } case 'img': @@ -131,17 +133,18 @@ export class HTML { case 'video': case 'audio': case 'source': { - await this.processMediaNode(node, uri); + await this.processMediaNode($(node), uri); break; } } } // Revert the converted noscripts - this.revertConvertedNoScript(doc); + this.revertConvertedNoScript($); - // return document back as string - return dom.serialize(); + // return cheerio.Root + // handle html() function convert to html string. + return $ || null; } /** @@ -149,12 +152,12 @@ export class HTML { * resources by setting Content-Security-Policy to only allow from * inline element and data URL. * - * @param {Document} doc JSDOM.window.document + * @param {Document} $ cheerio.Root * @api private */ - setContentSecurityPolicy(doc: Document): void { + setContentSecurityPolicy($: cheerio.Root): void { // Remove existing CSP - doc.querySelectorAll('meta[http-equiv="Content-Security-Policy"]').forEach((e) => removeChild(e)); + $('meta[http-equiv="Content-Security-Policy"]').remove(); const policies: string[] = ["default-src 'unsafe-inline' data:;", "connect-src 'none';"]; @@ -175,80 +178,77 @@ export class HTML { } // Append the new CSP - const head = doc.head; for (const policy of policies) { - const meta = doc.createElement('meta'); - meta.httpEquiv = 'Content-Security-Policy'; - meta.content = policy; - head.prepend(meta); + $('head').prepend(``); } } /** * Removes or replace elements following the configuration. * - * @param {Document} doc JSDOM.window.document + * @param {Document} $ cheerio.Root * @api private */ - applyConfiguration(doc: Document): void { + applyConfiguration($: cheerio.Root): void { if (this.opt.disableJS === true) { // Remove script tags - doc.querySelectorAll('script').forEach((e) => removeChild(e)); + $('script').remove(); // Remove links with javascript URL scheme - doc.querySelectorAll('a[href*="javascript:"]').forEach((e) => e.setAttribute('href', '#')); + $('a[href*="javascript:"]').attr('href', '#'); // Convert noscript to div - this.convertNoScriptToDiv(doc, false); + this.convertNoScriptToDiv($, false); } if (this.opt.disableCSS === true) { // Remove style tags - doc.querySelectorAll('style').forEach((e) => removeChild(e)); + $('style').remove(); // Remove inline style - doc.querySelectorAll('[style]').forEach((e) => e.removeAttribute('style')); + $('[style]').removeAttr('style'); } if (this.opt.disableEmbeds === true) { - doc.querySelectorAll('embed,object,iframe').forEach((e) => removeChild(e)); + $('embed,object,iframe').remove(); } if (this.opt.disableMedias === true) { - doc.querySelectorAll('img,picture,figure,video,audio,source').forEach((e) => removeChild(e)); + $('img,picture,figure,video,audio,source').remove(); } } /** * Convert all noscript to div element. * - * @param {Document} doc JSDOM.window.document + * @param {Document} $ cheerio.Root * @param {boolean} [markNewDiv] mark to noscript * @api private */ - convertNoScriptToDiv(doc: Document, markNewDiv = false): void { - doc.querySelectorAll('noscript').forEach((e: any) => { - const div = doc.createElement('div'); - div.innerHTML = e.innerHTML; - if (markNewDiv) { - div.setAttribute('data-cairn-noscript', 'true'); - } - e.parentNode.replaceChild(div, e); - }); + convertNoScriptToDiv($: cheerio.Root, markNewDiv = false): void { + if (markNewDiv) { + $('noscript').each((_, e) => { + e.tagName = 'div'; + $(e).attr('data-cairn-noscript', 'true'); + }); + } else { + $('noscript').each((_, e) => (e.tagName = 'div')); + } } /** * Find all comments in document then remove it. * - * @param {Document} doc JSDOM.window.document + * @param {Document} $ cheerio.Root * @api private */ - removeComments(doc: Document): void { - const nodeIterator = doc.createNodeIterator(doc, 128); // NodeFilter.SHOW_COMMENT - let currentNode; - while ((currentNode = nodeIterator.nextNode())) { - currentNode.remove(); - } + removeComments($: cheerio.Root): void { + $('*') + .contents() + .filter((_, e) => { + return e.type === 'comment'; + }) + .remove(); } /** @@ -256,14 +256,14 @@ export class HTML { * in lazy-loaded images and pictures, into basic attribute * src and srcset, so images that can be loaded without JS. * - * @param {Document} doc JSDOM.window.document + * @param {Document} $ cheerio.Root * @api private */ - convertLazyImageAttrs(doc: Document): void { + convertLazyImageAttrs($: cheerio.Root): void { // Convert img attributes - doc.querySelectorAll('img,picture,figure').forEach((e) => { - const src = (e).src; - const srcset = (e).srcset; + $('img,picture,figure').each((_, e) => { + const src = $(e).attr('src'); + const srcset = $(e).attr('srcset'); const tagName = e.tagName.toLowerCase(); // In some sites (e.g. Kotaku), they put 1px square image as data uri in @@ -275,20 +275,18 @@ export class HTML { // let srcCouldBeRemoved: boolean = false; // todo - if ((src !== '' || srcset !== '') && e.getAttribute('loading') === 'lazy') { + if ((src || srcset) && $(e).attr('loading') === 'lazy') { return; } - const attrs = e.attributes; - for (const attr of [...attrs]) { - if (attr.name === undefined) { + for (const [attrName, attrVal] of Object.entries(e.attribs)) { + if (attrName === undefined || typeof attrVal !== 'string') { continue; } - if (['src', 'srcset'].includes(attr.name.toLowerCase())) { + if (['src', 'srcset'].includes(attrName.toLowerCase())) { continue; } - const attrVal = attr.value; let copyTo = ''; if (this.rx.lazyImageSrcset.test(attrVal)) { copyTo = 'srcset'; @@ -301,19 +299,13 @@ export class HTML { } if (['img', 'picture'].includes(tagName)) { - e.setAttribute(copyTo, attrVal); - } else if (tagName === 'figure') { - const img = doc.createElement('img'); - img.setAttribute(copyTo, attrVal); - e.appendChild(img); + $(e).attr(copyTo, attrVal); + } else if (tagName === 'figure' && $(e).children('img, picture').length === 0) { + const img = ``; + $(e).append(img); } - e.removeAttribute(attr.name); - } - if (tagName === 'figure' && attrs.length === 0) { - const img = doc.createElement('img'); - // img.setAttribute(copyTo, attrVal); - e.appendChild(img); + $(e).removeAttr(attrName); } }); } @@ -323,11 +315,11 @@ export class HTML { * We do this for a, img, picture, figure, video, audio, source, link, * embed, iframe and object. * - * @param {Document} doc JSDOM.window.document + * @param {Document} $ cheerio.Root * @param {string} [url] original request url * @api private */ - convertRelativeURLs(doc: Document, url: string): void { + convertRelativeURLs($: cheerio.Root, url: string): void { const allowList: string[] = [ 'a', 'link', @@ -352,103 +344,126 @@ export class HTML { }; const mediaList = ['img', 'picture', 'figure', 'video', 'audio', 'source']; const convert = (node, attrName: string) => { - const oriURI = node.getAttribute(attrName); + const oriURI = $(node).attr(attrName); if (typeof oriURI === 'string') { let newVal: string = createAbsoluteURL(oriURI, url); try { newVal = decodeURI(newVal); } catch (_) {} - node.setAttribute(attrName, newVal); + $(node).attr(attrName, newVal); } }; - const nodeIterator = doc.createNodeIterator(doc.body); - let currentNode, tagName, attrName, name, srcset, newSrcset; - while ((currentNode = nodeIterator.nextNode())) { - tagName = currentNode.tagName; - if (typeof tagName !== 'string' || currentNode.hasAttributes() === false) { - continue; + $('*').each((_, e) => { + let tagName = e.tagName; + if (typeof tagName !== 'string' || Object.entries(e.attribs).length === 0) { + return; } - name = tagName.toLowerCase(); - if (allowList.includes(name) === false) { - continue; + + tagName = tagName.toLowerCase(); + if (allowList.includes(tagName) === false) { + return; } - if (slugs[name]) { - attrName = slugs[name]; - convert(currentNode, attrName); + if (slugs[tagName]) { + const attrName = slugs[tagName]; + convert(e, attrName); } - if (mediaList.includes(name)) { - convert(currentNode, 'src'); - convert(currentNode, 'poster'); + if (mediaList.includes(tagName)) { + convert(e, 'src'); + convert(e, 'poster'); - srcset = currentNode.getAttribute('srcset'); + const srcset = $(e).attr('srcset'); if (typeof srcset === 'string') { - newSrcset = createAbsoluteURL(srcset, url); + const newSrcset = createAbsoluteURL(srcset, url); try { - currentNode.setAttribute('srcset', decodeURI(newSrcset)); + $(e).attr('srcset', decodeURI(newSrcset)); } catch (_) { - continue; + return; } } } - } + }); } /** * Removes integrity attributes from link tags. * - * @param {Document} doc JSDOM.window.document + * @param {Document} $ cheerio.Root * @api private */ - removeLinkIntegrityAttr(doc: Document): void { - doc.querySelectorAll('link[integrity]').forEach((e) => { - e.removeAttribute('integrity'); - }); + removeLinkIntegrityAttr($: cheerio.Root): void { + $('link[integrity]').removeAttr('integrity'); } /** * Set og:title to title when it empty. * - * @param {Document} doc JSDOM.window.document + * @param {Document} $ cheerio.Root * @api private */ - convertOpenGraph(doc: Document): void { - let meta, attr, content, property; - const title = doc.head.querySelector('title'); + convertOpenGraph($: cheerio.Root): void { + const title = $('head > title').text().trim(); - doc.querySelectorAll('head > meta').forEach((e) => { - attr = e.getAttribute('property'); - content = e.getAttribute('content'); + $('head > meta').each((_, e) => { + const $elem = $(e); + const attr = $elem.attr('property'); + const content = $elem.attr('content'); if (attr && typeof attr === 'string' && attr.startsWith('og:')) { // real property - property = attr.substring(3); - meta = doc.createElement('meta'); - meta.setAttribute('property', property); - meta.setAttribute('content', content); - (e).parentNode.appendChild(meta); - - // replace title if it empty - if (title && title.innerHTML.trim().length < 1 && property.toLowerCase() === 'title') { - title.textContent = content; + const property = attr.substring(3); + if (!$elem.attr(property)) { + const meta = ``; + $elem.parent().append(meta); + + // replace title if it empty + if (title.length < 1 && property.toLowerCase() === 'title') { + $('head') + .remove($('title')) + .append(`${content}`); + } } } }); } - async processLinkNode(node: HTMLElement, baseURL = ''): Promise { - if (!node.hasAttribute('href')) { + async processStyleAttr(node: cheerio.Cheerio, baseURL = ''): Promise { + const style = node.attr('style'); + if (!style || style.length === 0) { + return; + } + + const newStyle = await css.process(style, baseURL); + if (newStyle.length > 0) { + node.attr('style', newStyle); + } + + return; + } + + async processStyleNode(node: cheerio.Cheerio, baseURL = ''): Promise { + const style = node.html(); + if (!style || style.length === 0) { return; } - const href = node.getAttribute('href'); - if (!href || typeof href !== 'string') { + const newStyle = await css.process(style, baseURL); + if (newStyle.length > 0) { + node.html(newStyle); + } + + return; + } + + async processLinkNode(node: cheerio.Cheerio, baseURL = ''): Promise { + const href = node.attr('href'); + if (!href || href.length === 0) { return; } - const rel = node.getAttribute('rel'); - if (typeof rel !== 'string') { + const rel = node.attr('rel'); + if (!rel || rel.length === 0) { return; } @@ -458,20 +473,16 @@ export class HTML { // Replace to `; + await uri.process(href, baseURL).then((data) => { + node.replaceWith(``); }); } return; } - async processURLNode(node: HTMLElement, attrName: string, baseURL: string): Promise { - if (!node.hasAttribute(attrName)) { - return; - } - - const url = node.getAttribute(attrName); + async processURLNode(node: cheerio.Cheerio, attrName: string, baseURL: string): Promise { + const url = node.attr(attrName); if (typeof url !== 'string' || url.trim().length < 1) { return; } @@ -479,31 +490,31 @@ export class HTML { const assetURL = createAbsoluteURL(url, baseURL); await convertToData(assetURL).then((data) => { if (data && typeof data === 'string' && data.trim().length > -1) { - node.setAttribute(attrName, data); + node.attr(attrName, data); } }); return; } - async processScriptNode(node: HTMLElement, baseURL: string): Promise { - const src = node.getAttribute('src'); + async processScriptNode(node: cheerio.Cheerio, baseURL: string): Promise { + const src = node.attr('src'); if (!src || typeof src !== 'string' || src.trim().length < 1) { return; } - await new URI().process(src, baseURL).then((data) => { - node.removeAttribute('src'); - node.textContent = data; + await uri.process(src, baseURL).then((data) => { + node.removeAttr('src'); + node.text(data); }); return; } - async processEmbedNode(node: HTMLElement, baseURL: string): Promise { - const attrName = node.tagName === 'OBJECT' ? 'data' : 'src'; + async processEmbedNode(node: cheerio.Cheerio, baseURL: string): Promise { + const attrName = node.get(0).tagName === 'object' ? 'data' : 'src'; - const url = node.getAttribute(attrName); + const url = node.attr(attrName); if (!url || typeof url !== 'string' || url.trim().length < 1) { return; } @@ -511,26 +522,26 @@ export class HTML { const assetURL = createAbsoluteURL(url, baseURL); await convertToData(assetURL).then((data) => { if (data && typeof data === 'string' && data.trim().length > -1) { - node.removeAttribute(attrName); - node.setAttribute(attrName, data); + node.removeAttr(attrName); + node.attr(attrName, data); } }); return; } - async processMediaNode(node: HTMLElement, baseURL: string): Promise { - const src = node.getAttribute('src'); + async processMediaNode(node: cheerio.Cheerio, baseURL: string): Promise { + const src = node.attr('src'); if (src && typeof src === 'string' && src.trim().length > 0) { await this.processURLNode(node, 'src', baseURL); } - const poster = node.getAttribute('poster'); + const poster = node.attr('poster'); if (poster && typeof poster === 'string' && poster.trim().length > 0) { await this.processURLNode(node, 'poster', baseURL); } - let srcset = node.getAttribute('srcset'); + let srcset = node.attr('srcset'); if (!srcset || typeof srcset !== 'string' || srcset.trim().length < 1) { return; } @@ -539,7 +550,7 @@ export class HTML { srcset = decodeURI(srcset); } catch (_) {} - const newSets: string[] = []; + let newSets: string[] = []; const matches = [...srcset.matchAll(this.rx.srcsetURL)]; for (const parts of matches) { if (!parts[1] || typeof parts[1] !== 'string') { @@ -557,21 +568,18 @@ export class HTML { newSets.push(newSet); } - node.setAttribute('srcset', newSets.join(',')); + node.attr('srcset', newSets.join(',')); + newSets = []; return; } - revertConvertedNoScript(doc: Document): void { - const divs = doc.getElementsByTagName('div'); - - for (const div of divs) { - const attr = div.getAttribute('data-cairn-noscript'); - if (attr === 'true' && div.parentNode) { - const noscript = doc.createElement('noscript'); - noscript.textContent = div.innerHTML; - div.parentNode.replaceChild(noscript, div); + revertConvertedNoScript($: cheerio.Root): void { + $('div').each((_, e) => { + if ($(e).attr('data-cairn-noscript') === 'true') { + e.tagName = 'noscript'; } - } + }); + return; } } diff --git a/src/types/cairn.ts b/src/types/cairn.ts index 3dead76..4d905eb 100644 --- a/src/types/cairn.ts +++ b/src/types/cairn.ts @@ -37,14 +37,15 @@ export declare type Options = { httpClient?: object; }; -export declare type Webpage = { - uri: string; - content: string; +export declare type Archived = { + url: string; + webpage: cheerio.Root | null; + status: 200 | 400 | 401 | 403 | 404 | 500 | 502 | 503 | 504; contentType: 'text/html' | 'text/plain' | 'text/*'; }; export interface Archiver { request(Requests): this; options(Options): this; - archive(): Promise; + archive(): Promise; } diff --git a/src/uri.ts b/src/uri.ts index 8b3c79c..a5ef8bf 100644 --- a/src/uri.ts +++ b/src/uri.ts @@ -1,17 +1,28 @@ -import { createAbsoluteURL, HTTP } from './utils'; +import { createAbsoluteURL, http } from './utils'; +import { css } from './css'; -export class URI { - async process(uri: string, baseURL: string): Promise { - if (uri.trim().length < 1) { - return ''; +class URI { + async process(url: string, baseURL: string): Promise { + let content = ''; + if (url.trim().length < 1) { + return content; } - const assetURL = createAbsoluteURL(uri, baseURL); - const response = await new HTTP().fetch(assetURL); + const assetURL = createAbsoluteURL(url, baseURL); + const response = await http.fetch(assetURL); if (typeof response !== 'object' || !Object.prototype.hasOwnProperty.call(response, 'data')) { - return ''; + return content; } + content = response.data; - return response.data; + const contentType = response.headers['content-type'] || ''; + if (contentType === 'text/css') { + content = await css.process(Buffer.from(content).toString(), baseURL); + } + + return content; } } + +const uri = new URI(); +export { uri }; diff --git a/src/utils/error.ts b/src/utils/error.ts index 02ca292..f9b7d55 100644 --- a/src/utils/error.ts +++ b/src/utils/error.ts @@ -1,3 +1,3 @@ -export const Err = (msg: string): never => { +export const err = (msg: string): never => { throw new Error(msg); }; diff --git a/src/utils/helper.ts b/src/utils/helper.ts index 4bd7a72..e354d5e 100644 --- a/src/utils/helper.ts +++ b/src/utils/helper.ts @@ -1,5 +1,5 @@ // import fs from 'fs'; -import { HTTP } from './http'; +import { http } from './http'; export const isValidURL = (uri: string): boolean => { if (!uri || uri.length < 3) { @@ -73,7 +73,7 @@ export const convertToData = async (uri: string): Promise => { return ''; } - const resource = await new HTTP().setResponseType('arraybuffer').fetch(uri); + const resource = await http.setResponseType('arraybuffer').fetch(uri); if (!resource || typeof resource !== 'object' || !Object.prototype.hasOwnProperty.call(resource, 'data')) { return ''; } diff --git a/src/utils/http.ts b/src/utils/http.ts index 6d955a9..ccf68f8 100644 --- a/src/utils/http.ts +++ b/src/utils/http.ts @@ -1,9 +1,9 @@ -import axios, { AxiosResponse, ResponseType } from 'axios'; +import axios, { ResponseType } from 'axios'; import { isValidURL } from '.'; -export class HTTP { +class HTTP { private timeout = 60; - private responseType: ResponseType = 'blob'; + private responseType: ResponseType = 'document'; constructor() { const ua = @@ -16,12 +16,6 @@ export class HTTP { } } - private async do(url: string): Promise { - return global.axios.get(url, { - responseType: this.responseType, - }); - } - setHeader(key: string, val: string | number): this { if (key.length < 1) { return this; @@ -48,10 +42,11 @@ export class HTTP { if (url.startsWith('data:') || url.startsWith('about:') || !isValidURL(url)) { return; } - return await this.do(url) - .then((response) => { - // response keys: status, statusText, headers, config, request, data - return response; + + // response keys: status, statusText, headers, config, request, data + return await global.axios + .get(url, { + responseType: this.responseType, }) .catch((err) => { if (err.response) { @@ -67,3 +62,6 @@ export class HTTP { }); } } + +const http = new HTTP(); +export { http }; diff --git a/test/archiver.test.ts b/test/archiver.test.ts index 8cfbe64..5327caa 100644 --- a/test/archiver.test.ts +++ b/test/archiver.test.ts @@ -2,8 +2,7 @@ import { Archiver } from '../src/archiver'; describe('Archiver', () => { const archiver = new Archiver(); - // const requests = { url: 'https://www.google.com/' }; - const requests = { url: 'https://en.wikipedia.org/wiki/Main_Page' }; + const requests = { url: 'https://www.google.com/' }; it('should called request function', () => { const request = archiver.request(requests); @@ -14,10 +13,9 @@ describe('Archiver', () => { await archiver .request(requests) .archive() - .then((webpage) => { - // console.log(webpage.content); - - expect(webpage.length > 1).toBe(true); + .then((archived) => { + expect(archived.url).toBe(requests.url); + expect(archived.status).toBe(200); }); }); }); diff --git a/test/process.test.ts b/test/process.test.ts index 6eec346..93d7d85 100644 --- a/test/process.test.ts +++ b/test/process.test.ts @@ -1,8 +1,8 @@ -import { Webpage } from '../src/types/cairn'; import { HTML } from '../src/html'; -import { CSS } from '../src/css'; +import { css } from '../src/css'; import { server } from './server'; import { JSDOM } from 'jsdom'; +import cheerio from 'cheerio'; const content = ` @@ -23,16 +23,16 @@ const content = ` `; -const webpage: Webpage = { +const webpage: { uri: string; html: string } = { uri: 'https://www.google.com/', - content: content, - contentType: 'text/html', + html: content, }; +let $ = cheerio.load(content); let dom = new JSDOM(content); let document = dom.window.document; -const html = new HTML(); +const html = new HTML({}); const port = 9112; /** @@ -58,10 +58,12 @@ describe('HTML', () => { meta.httpEquiv = 'Content-Security-Policy'; meta.content = 'some rules; report-uri /cairn/'; document.getElementsByTagName('head')[0].appendChild(meta); + $ = cheerio.load(dom.serialize()); // Remove existing CSP - html.setContentSecurityPolicy(document); - expect(dom.serialize()).toEqual(expect.not.stringContaining('cairn')); + html.setContentSecurityPolicy($); + expect($.root().html()).toEqual(expect.not.stringContaining('cairn')); + expect($.root().html()).toEqual(expect.stringContaining('Content-Security-Policy')); }); test('applyConfiguration', () => { @@ -73,6 +75,10 @@ describe('HTML', () => { a.href = 'javascript::onclick();'; a.style = 'color:blue;'; document.getElementsByTagName('body')[0].appendChild(a); + const b = document.createElement('a'); + b.href = 'javascript:;'; + b.style = 'color:blue;'; + document.getElementsByTagName('body')[0].appendChild(b); for (const tag of ['embed', 'iframe', 'object']) { const ele = document.createElement(tag); @@ -85,11 +91,13 @@ describe('HTML', () => { document.getElementsByTagName('body')[0].appendChild(ele); } + $ = cheerio.load(dom.serialize()); + // Remove script tag html.opt = { disableJS: true, disableCSS: true, disableEmbeds: true, disableMedias: true }; - html.applyConfiguration(document); + html.applyConfiguration($); - const raw = dom.serialize(); + const raw = $.root().html(); expect(raw).toEqual(expect.not.stringContaining('script')); expect(raw).toEqual(expect.not.stringContaining('javascript')); expect(raw).toEqual(expect.stringContaining('href="#"')); @@ -118,9 +126,11 @@ describe('HTML', () => { noscript.append(a); document.getElementsByTagName('body')[0].appendChild(noscript); + $ = cheerio.load(dom.serialize()); + // Convert noscript - html.convertNoScriptToDiv(document); - const raw = dom.serialize(); + html.convertNoScriptToDiv($); + const raw = $.root().html(); expect(raw).toEqual(expect.not.stringContaining('