Improve interface naming and update to version 1.1.1
Gillis Van Ginderachter committed Feb 4, 2021
1 parent 21a22a0 commit 7c785f3
Showing 36 changed files with 413 additions and 176 deletions.
79 changes: 50 additions & 29 deletions README.md
@@ -1,18 +1,14 @@
# crawler-ts

<p align="center">
Crawler written in TypeScript using ES6 generators.
</p>
Lightweight crawler written in TypeScript using ES6 generators.

<p align="center">
<a href="https://www.npmjs.com/package/crawler-ts">
<img alt="npm" src="https://img.shields.io/npm/v/crawler-ts.svg?color=green"/>
</a>
<a href="https://bundlephobia.com/result?p=crawler-ts">
<img alt="bundle size" src="https://img.shields.io/bundlephobia/minzip/crawler-ts?label=bundle size"/>
</a>
<img alt="license" src="https://img.shields.io/npm/l/crawler-ts?label=license&color=green"/>
</p>
<a href="https://www.npmjs.com/package/crawler-ts">
<img alt="npm" src="https://img.shields.io/npm/v/crawler-ts.svg?color=green"/>
</a>
<a href="https://bundlephobia.com/result?p=crawler-ts">
<img alt="bundle size" src="https://img.shields.io/bundlephobia/minzip/crawler-ts?label=bundle size"/>
</a>
<img alt="license" src="https://img.shields.io/npm/l/crawler-ts?label=license&color=green"/>

## Installation

@@ -22,51 +18,63 @@ npm install --save crawler-ts crawler-ts-htmlparser2

## Examples

- [Crawl NASA Mars News](./examples/mars-news/src/index.ts)
- [Crawl Hacker News](./examples/hacker-news/src/index.ts)
- [Crawl the file system](./examples/fs/src/index.ts)
- [Crawl Github](./examples/http/src/index.ts)

## API

The `crawl` function expects the following configuration as the first parameter.
The `createCrawler` function expects the following options as the first parameter.

```typescript
/**
* @type {Location} The type of the locations to crawl, e.g. `URL` or `string` that represents a path.
* @type {Response} The type of the response at the location that is crawler, e.g. Cheerio object, file system `fs.Stats`.
* @type {Result} The intermediate result that can be parsed from the response and generated by the crawler.
* @type {L} The type of the locations to crawl, e.g. `URL` or `string` that represents a path.
* @type {R} The type of the response at the location that is crawled, e.g. a Cheerio object, or file system `fs.Stats`.
* @type {P} The intermediate result parsed from the response and yielded by the crawler.
*/
export interface Config<Location, Response, Result> {
interface Options<L, R, P> {
/**
* This function should return the response for the given location.
*/
requester(loc: Location): Response | Promise<Response | undefined>;
requester(location: L): ValueOrPromise<R | undefined>;
/**
* This function should return true if the crawler should parse the response, or false if not.
*/
shouldParse(loc: Location, response: Response): boolean | Promise<boolean>;
shouldParse(props: PreParseProps<L, R>): ValueOrPromise<boolean>;
/**
* This function should parse the response and convert the response to the result type.
* This function should parse the response and convert the response to the parsed type.
*/
parser(loc: Location, response: Response): Result | Promise<Result | undefined>;
parser(props: PreParseProps<L, R>): ValueOrPromise<P | undefined>;
/**
* This function should return true if the crawler should yield the result, or false if not.
* This function should return true if the crawler should yield the parsed result, or false if not.
*/
shouldYield(result: Result): boolean | Promise<boolean>;
shouldYield(props: PostParseProps<L, R, P>): ValueOrPromise<boolean>;
/**
* This function should yield all the locations to follow in the given result.
* This function should yield all the locations to follow in the given parsed result.
*/
follower(result: Result): AsyncGenerator<Location>;
follower(props: PostParseProps<L, R, P>): AsyncGenerator<L>;
/**
* This function should return true if the crawler should queue the location for crawling, or false if not.
*/
shouldQueue(loc: Location): boolean | Promise<boolean>;
shouldQueue(props: { location: L; origin: L; response: R; parsed: P }): ValueOrPromise<boolean>;
/**
* The logger can be set to `console` to output debug information to the `console`.
*
* @default undefined
*/
logger?: Logger;
}

interface PreParseProps<L, R> {
location: L;
response: R;
}

interface PostParseProps<L, R, P> extends PreParseProps<L, R> {
parsed: P;
}

type ValueOrPromise<T> = T | Promise<T>;
```
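
For illustration, here is a minimal sketch that wires these options together over a toy in-memory site. It assumes `createCrawler` is exported by the core `crawler-ts` package with the generic signature implied above; the page data and the dedup logic are invented for the example.

```typescript
import { createCrawler } from 'crawler-ts';

// Toy in-memory "site": each page body lists the pages it links to.
const pages: Record<string, string> = {
  home: 'links: about, blog',
  about: 'links: home',
  blog: 'links: home, about',
};

const visited = new Set<string>();

const crawler = createCrawler<string, string, string>({
  // Resolve a location to its response; undefined means "nothing there".
  requester: (location) => pages[location],
  shouldParse: () => true,
  // No real parsing needed here, pass the response through.
  parser: ({ response }) => response,
  shouldYield: () => true,
  // Yield every location mentioned after "links:".
  follower: async function* ({ parsed }) {
    const [, list = ''] = parsed.match(/links: (.*)/) ?? [];
    yield* list.split(', ').filter(Boolean);
  },
  // Queue each known location at most once; the built-in `ignoreDoubles`
  // helper does the same job in the real examples below.
  shouldQueue: ({ location }) => {
    if (!(location in pages) || visited.has(location)) return false;
    visited.add(location);
    return true;
  },
});

async function main() {
  for await (const { location, parsed } of crawler('home')) {
    console.log(location, '→', parsed);
  }
}

main();
```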

There are built-in modules available that implement some of these configuration values. See the [Modules](.#modules) section.
@@ -84,7 +92,7 @@ There are built-in modules available that implement some of these configuration
</a>
</p>

This module implements a requester that uses `node-fetch` to request content over HTTP.
This module implements a `requester` that uses `node-fetch` to request content over HTTP.

See [modules/crawler-ts-fetch](./modules/crawler-ts-fetch).
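
The module's exports are not shown in this commit, but a requester built on `node-fetch` fits the core `requester` option along these lines. This is a hand-rolled sketch, not necessarily the module's actual API:

```typescript
import fetch from 'node-fetch';

// Hypothetical stand-in for what crawler-ts-fetch provides: resolve a URL
// to its response body, or undefined when the request fails.
const requester = async (location: URL): Promise<string | undefined> => {
  const response = await fetch(location.href);
  if (!response.ok) return undefined;
  return response.text();
};
```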

@@ -99,10 +107,23 @@ See [modules/crawler-ts-fetch](./modules/crawler-ts-fetch).
</a>
</p>

This module implements a requester, parser and follower for HTML. The requester uses `crawler-ts-fetch` to request content over HTTP. The parser uses `htmlparser2` to parse HTML files. The follower uses the parser result to find `<a>` anchor elements and yields its `href` properties.
This module implements a `requester`, `parser` and `follower` for HTML. The `requester` uses `crawler-ts-fetch` to request content over HTTP. The `parser` uses `htmlparser2` to parse HTML files. The `follower` uses the parser result to find `<a>` anchor elements and yields their `href` properties.

See [modules/crawler-ts-htmlparser2](./modules/crawler-ts-htmlparser2).
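
A minimal usage sketch, modeled on the Hacker News and Mars news examples added in this commit. It assumes the module's `createCrawler` fills in the HTML `requester`, `parser` and `follower` by default, as those examples do:

```typescript
import { createCrawler, allowHtml, allowProtocols } from 'crawler-ts-htmlparser2';

async function main() {
  const crawler = createCrawler({
    // Only parse responses served as HTML.
    shouldParse: allowHtml(),
    // Only queue http/https links; a real crawl would also dedupe,
    // e.g. with ignoreDoubles from crawler-ts.
    shouldQueue: allowProtocols(['http', 'https']),
    shouldYield: () => true,
  });

  for await (const { location } of crawler(new URL('https://example.com'))) {
    console.log(location.href);
  }
}

main();
```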

### crawler-ts-fs

<p>
<a href="https://www.npmjs.com/package/crawler-ts-fs">
<img alt="npm" src="https://img.shields.io/npm/v/crawler-ts-fs.svg?color=green"/>
</a>
<a href="https://bundlephobia.com/result?p=crawler-ts-fs">
<img alt="bundle size" src="https://img.shields.io/bundlephobia/minzip/crawler-ts-fs?label=bundle size"/>
</a>
</p>

This module implements a `requester`, `parser` and `follower` for the file system. The `requester` uses `fs.stat` to request file information. The `parser` by default just returns the response from the `requester`. The `follower` follows directories.
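
A usage sketch modeled on the rewritten fs example in this commit; it assumes the module's `createCrawler` supplies the fs-based defaults described above:

```typescript
import { createCrawler } from 'crawler-ts-fs';

async function main() {
  const crawler = createCrawler({
    shouldParse: () => true,
    // The default parser returns the fs.Stats response, so `parsed`
    // exposes isFile()/isDirectory().
    shouldYield: ({ parsed }) => parsed.isFile(),
    // Queue everything; a real crawl would dedupe, as the fs example
    // below does with ignoreDoubles.
    shouldQueue: () => true,
  });

  // Walk the current directory and print every regular file.
  for await (const { location } of crawler(process.cwd())) {
    console.log(location);
  }
}

main();
```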

## Author

Gillis Van Ginderachter
5 changes: 3 additions & 2 deletions examples/fs/package.json
@@ -1,5 +1,5 @@
{
"name": "crawler-ts-example-fs",
"name": "@crawler-ts/example-fs",
"version": "1.0.0",
"private": true,
"author": {
@@ -12,7 +12,8 @@
"url": "https://github.com/supergillis/crawler-ts.git"
},
"dependencies": {
"crawler-ts": "workspace:*"
"crawler-ts": "workspace:*",
"crawler-ts-fs": "workspace:*"
},
"devDependencies": {
"@types/node": "12.x",
35 changes: 5 additions & 30 deletions examples/fs/src/index.ts
@@ -1,6 +1,6 @@
import * as fs from 'fs';
import * as path from 'path';
import { crawl, allowExtensions, ignoreDoubles, ignoreRegex, Logger } from 'crawler-ts/src';
import { allowExtensions, ignoreDoubles, ignoreRegex, Logger } from 'crawler-ts/src';
import { createCrawler } from 'crawler-ts-fs/src';

const entryIsFile = ({ parsed }: { parsed: fs.Stats }) => parsed.isFile();

@@ -12,44 +12,19 @@ const pathIgnoreDoubles = ignoreDoubles();
/**
* File crawler that finds all ".ts" files.
*/
const fileCrawler = () =>
crawl<string, fs.Stats, fs.Stats>({
// Use the file system to request paths
requester: fileRequester(),
const createTypeScriptCrawler = () =>
createCrawler({
// Ignore .git, dist and node_module files
shouldParse: pathIgnoreRegex([/\/\.git$/, /\/dist$/, /\/node_modules$/]),
// No need for parsing, just return the response
parser: ({ response }) => response,
// Only yield paths with extension ".ts" that are files
shouldYield: ({ location, parsed }) => entryIsFile({ parsed }) && allowTypeScript({ location }),
// Follow files in a directory
follower: fileFollower(),
// Ignore doubles
shouldQueue: pathIgnoreDoubles(),
});

/**
* Requester that requests file system stats for the path.
*/
const fileRequester = () => async (location: string): Promise<fs.Stats> => fs.statSync(location);

/**
* Follower that follows all paths inside a directory.
*/
const fileFollower = (logger?: Logger) =>
async function* ({ location, parsed }: { location: string; parsed: fs.Stats }) {
if (parsed.isDirectory()) {
logger?.info(`Following directory "${location}"`);
const entries = fs.readdirSync(location);
yield* entries.map((e) => path.resolve(location, e));
} else {
logger?.info(`Not following non-directory "${location}"`);
}
};

async function main() {
const root = process.argv?.[2] ?? process.cwd();
const crawler = fileCrawler();
const crawler = createTypeScriptCrawler();

for await (const { location, parsed } of crawler(root)) {
// Do something with the crawled result
25 changes: 25 additions & 0 deletions examples/hacker-news/package.json
@@ -0,0 +1,25 @@
{
"name": "@crawler-ts/example-mars-news",
"version": "1.0.0",
"private": true,
"author": {
"name": "Gillis Van Ginderacter",
"email": "[email protected]",
"url": "https://github.com/supergillis"
},
"repository": {
"type": "git",
"url": "https://github.com/supergillis/crawler-ts.git"
},
"dependencies": {
"crawler-ts": "workspace:*",
"crawler-ts-htmlparser2": "workspace:*",
"css-select": "^3.1.2",
"domhandler": "^4.0.0",
"domutils": "^2.4.4"
},
"devDependencies": {
"@types/node": "12.x",
"typescript": "^4.1.3"
}
}
51 changes: 51 additions & 0 deletions examples/hacker-news/src/index.ts
@@ -0,0 +1,51 @@
import type { Element } from 'domhandler';
import { selectAll } from 'css-select';
import { getAttributeValue, getText } from 'domutils';
import { chain, allowRegex, ignoreDoubles } from 'crawler-ts/src';
import { createCrawler, allowHtml, allowProtocols } from 'crawler-ts-htmlparser2/src';

async function main() {
const hackerNewsPageRegex = /\/news\.ycombinator\.com\/news\?p=([\d]+)/;

const allowUrlRegex = allowRegex<URL>((url) => url.href);

// In this case we find the "?p=:page" piece in the URL and use it to detect duplicates
const ignorePageDoubles = ignoreDoubles<URL>((url) => {
const match = url.href.match(hackerNewsPageRegex);
const pageId = match?.[1];
return pageId ?? '';
});

// Only parse text/html
const shouldParse = allowHtml();

// Only queue links that
// - use http or https
// - point to a news page
// - have not been visited before
const shouldQueue = chain(
allowProtocols(['http', 'https']),
// Allow news pages
allowUrlRegex([hackerNewsPageRegex]),
// Ignore already visited
ignorePageDoubles(),
);

const crawler = createCrawler({
shouldParse,
shouldQueue,
shouldYield: () => true,
});

const root = new URL('https://news.ycombinator.com/news');
for await (const { location, parsed } of crawler(root)) {
// Do something with the crawled result
const titleElements = selectAll('a.storylink', parsed);
const titles = titleElements.map((e) => ({
value: getText(e),
href: getAttributeValue(e as Element, 'href'),
}));
// Log all titles
titles.forEach((title) => console.log(title.href, title.value));
}
}

main();
File renamed without changes.
@@ -1,5 +1,5 @@
{
"name": "crawler-ts-example-http",
"name": "@crawler-ts/example-mars-news",
"version": "1.0.0",
"private": true,
"author": {
@@ -13,7 +13,9 @@
},
"dependencies": {
"crawler-ts": "workspace:*",
"crawler-ts-htmlparser2": "workspace:*"
"crawler-ts-htmlparser2": "workspace:*",
"css-select": "^3.1.2",
"domutils": "^2.4.4"
},
"devDependencies": {
"@types/node": "12.x",
12 changes: 8 additions & 4 deletions examples/http/src/index.ts → examples/mars-news/src/index.ts
@@ -1,5 +1,7 @@
import { selectOne } from 'css-select';
import { getText } from 'domutils';
import { chain, allowRegex, ignoreDoubles } from 'crawler-ts/src';
import { crawl, allowHtml, allowProtocols } from 'crawler-ts-htmlparser2/src';
import { createCrawler, allowHtml, allowProtocols } from 'crawler-ts-htmlparser2/src';

async function main() {
const nasaMarsBlogRegex = /\/mars\.nasa\.gov\/news\/([\d]+)\//;
@@ -8,7 +10,7 @@ async function main() {

// In this case we find the ":id" piece in the URL and use it to detect duplicates
const ignoreMarsNewsDoubles = ignoreDoubles<URL>((url) => {
const match = url.pathname.match(/news\/([\d]+)\//);
const match = url.href.match(nasaMarsBlogRegex);
const newsId = match?.[1];
return newsId ?? '';
});
@@ -26,7 +28,7 @@ async function main() {
ignoreMarsNewsDoubles(),
);

const crawler = crawl({
const crawler = createCrawler({
shouldParse,
shouldQueue,
shouldYield: () => true,
@@ -35,7 +37,9 @@ async function main() {
const root = new URL('https://mars.nasa.gov/news');
for await (const { location, parsed } of crawler(root)) {
// Do something with the crawled result
console.log(location.href, parsed.length);
const titleElement = selectOne('h1', parsed);
const title = titleElement ? getText(titleElement).trim() : 'N/A';
console.log(location.href, title);
}
}

File renamed without changes.
1 change: 1 addition & 0 deletions packages/crawler-ts-fetch/.gitignore
@@ -0,0 +1 @@
README.md
@@ -1,6 +1,17 @@
{
"name": "crawler-ts-fetch",
"version": "1.1.0",
"version": "1.1.1",
"description": "Lightweight crawler written in TypeScript using ES6 generators.",
"keywords": [
"crawl",
"crawler",
"crawling-framework",
"crawling",
"es6-generators",
"typescript",
"web-crawler",
"web-crawling"
],
"author": {
"name": "Gillis Van Ginderacter",
"email": "[email protected]",
File renamed without changes.
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions packages/crawler-ts-fs/.gitignore
@@ -0,0 +1 @@
README.md
