Skip to content

Commit

Permalink
Migrate to cheerio
Browse files Browse the repository at this point in the history
  • Loading branch information
web-flow committed Oct 25, 2020
1 parent 3dc3c00 commit c82d560
Show file tree
Hide file tree
Showing 15 changed files with 664 additions and 447 deletions.
102 changes: 82 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,31 @@ it is TypeScript implementation of [Obelisk](https://github.com/go-shiori/obelis

## Usage

### As CLI tool

```sh
npm install -g @wabarc/cairn
```

```sh
$ cairn -h

Usage: cairn [options] url1 [url2]...[urlN]

CLI tool for saving web page as single HTML file

Options:
-v, --version output the current version
-o, --output <string> path to save archival result
-u, --user-agent <string> set custom user agent
-t, --timeout <number> maximum time (in second) request timeout
--no-js disable JavaScript
--no-css disable CSS styling
--no-embeds remove embedded elements (e.g iframe)
--no-medias remove media elements (e.g img, audio)
-h, --help display help for command
```

### As npm package

```sh
Expand All @@ -25,42 +50,79 @@ npm install @wabarc/cairn

```javascript
import { Cairn } from '@wabarc/cairn';
// const cairn = require('@wabarc/cairn');

const cairn = new Cairn();

cairn
.request({ url: url })
.options({ userAgent: 'Cairn/1.0.0' })
.options({ userAgent: 'Cairn/2.0.0' })
.archive()
.then((webpage) => {
console.log(url, webpage);
.then((archived) => {
console.log(archived.url, archived.webpage.html());
})
.catch((err) => console.warn(`${url} => ${JSON.stringify(err)}`));
```

### As CLI tool
#### Instance methods

```sh
npm install -g @wabarc/cairn
##### cairn#request({ url: string }): this
##### cairn#options({}): this
- userAgent?: string;
- disableJS?: boolean;
- disableCSS?: boolean;
- disableEmbeds?: boolean;
- disableMedias?: boolean;
- timeout?: number;

##### cairn#archive(): Promise<Archived>
##### cairn#Archived
- url: string;
- webpage: cheerio.Root;
- status: 200 | 400 | 401 | 403 | 404 | 500 | 502 | 503 | 504;
- contentType: 'text/html' | 'text/plain' | 'text/*';

#### Request Params

##### request

```javascript
{
// `url` is the archival target.
url: 'https://www.github.com'
}
```

```sh
$ cairn -h
##### options

Usage: cairn [options] url1 [url2]...[urlN]
```javascript
{
userAgent: 'Cairn/2.0.0',

CLI tool for saving web page as single HTML file
disableJS: true,
disableCSS: false,
disableEmbeds: false,
disableMedias: true,

Options:
-v, --version output the current version
-o, --output <string> path to save archival result
-u, --user-agent <string> set custom user agent
-t, --timeout <number> maximum time (in second) request timeout
--no-js disable JavaScript
--no-css disable CSS styling
--no-embeds remove embedded elements (e.g iframe)
--no-medias remove media elements (e.g img, audio)
-h, --help display help for command
timeout: 30
}
```

#### Response Schema

For v1.x:

The `archive` method returns the webpage body as a string.

For v2.x:

The `archive` method resolves with an `Archived` object:

```javascript
{
url: 'https://github.com/',
webpage: cheerio.Root,
status: 200,
contentType: 'text/html'
}
```

## License
Expand Down
8 changes: 5 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@
},
"homepage": "https://github.com/wabarc/cairn#readme",
"dependencies": {
"axios": "^0.20.0",
"commander": "^6.1.0",
"jsdom": "^16.4.0"
"axios": "^0.21.0",
"cheerio": "^1.0.0-rc.3",
"commander": "^6.1.0"
},
"devDependencies": {
"@types/cheerio": "^0.22.22",
"@types/jest": "^26.0.15",
"@types/node": "^14.14.2",
"@typescript-eslint/eslint-plugin": "^4.5.0",
Expand All @@ -50,6 +51,7 @@
"eslint-plugin-jest": "^24.1.0",
"eslint-plugin-prettier": "^3.1.4",
"jest": "^26.6.1",
"jsdom": "^16.4.0",
"nodemon": "^2.0.6",
"prettier": "^2.1.2",
"ts-jest": "^26.4.2",
Expand Down
58 changes: 21 additions & 37 deletions src/archiver.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { Archiver as ArchiverImpl, Options, Requests, Webpage } from './types/cairn';
import { Err, HTTP, isValidURL } from './utils';
import { Archiver as ArchiverImpl, Options, Requests, Archived } from './types/cairn';
import { err, http, isValidURL } from './utils';
import { HTML } from './html';

export class Archiver implements ArchiverImpl {
Expand All @@ -23,7 +23,7 @@ export class Archiver implements ArchiverImpl {
request(r: Requests): this {
const { url } = r;
if (!isValidURL(url)) {
Err('request url is not specified');
err('request url is not specified');
}

this.req.url = url;
Expand All @@ -50,43 +50,27 @@ export class Archiver implements ArchiverImpl {
* @return {Promise<Archived>} resolves with the archived result (url, webpage, status, contentType)
* @api public
*/
async archive(): Promise<Archived> {
  // Downloads `this.req.url` and, when the response is HTML, runs it through
  // `HTML#process`. Always resolves with an `Archived` record; `webpage`
  // stays `null` when the download failed or the content is not HTML.

  // Pessimistic defaults so every early return below is a well-formed result.
  const archived: Archived = { url: this.req.url, webpage: null, status: 400, contentType: 'text/html' };

  // The callback parameter must NOT be named `err`: that would shadow the
  // imported `err` helper and end up calling the caught error object as a
  // function, masking the real failure with a TypeError.
  // NOTE(review): assumes the `err` helper from './utils' reports the failure;
  // confirm whether it returns or throws.
  const response = await this.download(this.req.url).catch((e) => err(e));

  // `response` is undefined when the helper above returned normally, so it
  // has to be checked before any property access.
  if (!response || response.isAxiosError === true || !response.headers) {
    return archived;
  }

  const contentType = response.headers['content-type'] || response.headers['Content-Type'] || '';
  // Only HTML documents are processed; for any other content type the
  // status and content type are still reported but `webpage` remains null.
  if (contentType.includes('text/html') === true) {
    archived.webpage = await new HTML(this.opt).process({ uri: this.req.url, html: response.data });
  }
  archived.status = response.status || archived.status;
  archived.contentType = contentType;

  return archived;
}

async download(url: string, referer?: string): Promise<any> {
const http = new HTTP();

if (this.opt.userAgent) {
http.setHeader('User-Agent', this.opt.userAgent);
}
Expand All @@ -95,6 +79,6 @@ export class Archiver implements ArchiverImpl {
http.setOptions({ timeout: this.opt.timeout });
}

return await http.fetch(url).catch((err) => Err(err));
return await http.setResponseType('text').fetch(url);
}
}
35 changes: 4 additions & 31 deletions src/cairn.ts
Original file line number Diff line number Diff line change
@@ -1,43 +1,16 @@
import { Options, Requests } from './types/cairn';
import { Archiver } from './archiver';
export { Archived } from './types';

process.on('uncaughtException', (e) => {
console.error(e);
});

class Cairn {
private arc: Archiver;
class Cairn extends Archiver {}

/**
* Initialize a new `Cairn`.
*
* @api public
*/
constructor() {
this.arc = new Archiver();
}

request(r: Requests): this {
this.arc.request(r);

return this;
}

options(o: Options): this {
this.arc.options(o);
return this;
}

archive(): Promise<string> {
return this.arc.archive();
}
}
const cairn = new Cairn();

exports = module.exports = new Cairn();
exports = module.exports = cairn;
exports.cairn = exports;

exports.Cairn = Cairn;

const cairn = new Cairn();

export { Cairn, cairn };
24 changes: 16 additions & 8 deletions src/cli.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#!/usr/bin/env node

import { Options } from './types/cairn';
import { Command } from 'commander';
import { Cairn } from './cairn';
import { Archiver } from './archiver';
import { isValidURL, createFileName } from './utils';
import { statSync, writeFile } from 'fs';

Expand All @@ -15,7 +14,7 @@ class Handler {
this.opt = {};
}

main() {
async main() {
const program = this.parser();

if (this.url.length < 1) {
Expand All @@ -32,7 +31,7 @@ class Handler {
filepath = program.output + '/';
}

const output = (url: string, filename: string, content: string) => {
const output = async (url: string, filename: string, content: string) => {
if (program.output === '-') {
console.info(content);
} else {
Expand All @@ -46,20 +45,29 @@ class Handler {
}
};

const cairn = new Cairn();
const cairn = new Archiver();
for (const url of this.url) {
if (!isValidURL(url)) {
console.info(`${url} => request url is not specified\n`);
continue;
}
const filename = filepath + createFileName(url);

cairn
await cairn
.request({ url: url })
.options(this.opt)
.archive()
.then((webpage) => {
output(url, filename, webpage);
.then(async (archived) => {
if (!archived.webpage || typeof archived.webpage.root !== 'function') {
return;
}

const html = archived.webpage.root() ? archived.webpage.root().html() : '';
if (!html) {
console.warn(`${url} => archival failure. [status: ${archived.status}]`);
return;
}
await output(url, filename, html || '');
})
.catch((err) => console.warn(`${url} => ${JSON.stringify(err)}`));
}
Expand Down
Loading

0 comments on commit c82d560

Please sign in to comment.