-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve interface naming and update to version 1.1.1
- Loading branch information
Gillis Van Ginderachter
committed
Feb 4, 2021
1 parent
21a22a0
commit 7c785f3
Showing
36 changed files
with
413 additions
and
176 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
"name": "@crawler-ts/example-mars-news", | ||
"version": "1.0.0", | ||
"private": true, | ||
"author": { | ||
"name": "Gillis Van Ginderachter", | ||
"email": "[email protected]", | ||
"url": "https://github.com/supergillis" | ||
}, | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/supergillis/crawler-ts.git" | ||
}, | ||
"dependencies": { | ||
"crawler-ts": "workspace:*", | ||
"crawler-ts-htmlparser2": "workspace:*", | ||
"css-select": "^3.1.2", | ||
"domhandler": "^4.0.0", | ||
"domutils": "^2.4.4" | ||
}, | ||
"devDependencies": { | ||
"@types/node": "12.x", | ||
"typescript": "^4.1.3" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import type { Element } from 'domhandler'; | ||
import { selectAll } from 'css-select'; | ||
import { getAttributeValue, getText } from 'domutils'; | ||
import { chain, allowRegex, ignoreDoubles } from 'crawler-ts/src'; | ||
import { createCrawler, allowHtml, allowProtocols } from 'crawler-ts-htmlparser2/src'; | ||
|
||
/**
 * Example crawler: walks the paginated Hacker News front page and logs the
 * link and title of every `a.storylink` anchor found on each yielded page.
 *
 * Crawling starts at `https://news.ycombinator.com/news`. Only links that pass
 * the `shouldQueue` chain below are followed.
 */
async function main(): Promise<void> {
  // Matches paginated front-page URLs; capture group 1 is the page number
  const hackerNewsPageRegex = /\/news\.ycombinator\.com\/news\?p=([\d]+)/;

  // Regex filter that is evaluated against the full href of a URL
  const allowUrlRegex = allowRegex<URL>((url) => url.href);

  // In this case we find the "?p=:page" piece in the URL and use it to detect duplicates.
  // URLs without a page number all map to the same empty key.
  const ignorePageDoubles = ignoreDoubles<URL>((url) => {
    const match = url.href.match(hackerNewsPageRegex);
    const pageId = match?.[1];
    return pageId ?? '';
  });

  // Only parse text/html
  const shouldParse = allowHtml();

  // Only queue links that:
  // - use the http or https protocol
  // - match the news page regex
  // - have not been visited already
  const shouldQueue = chain(
    allowProtocols(['http', 'https']),
    // Allow news pages
    allowUrlRegex([hackerNewsPageRegex]),
    // Ignore already visited
    ignorePageDoubles(),
  );

  const crawler = createCrawler({
    shouldParse,
    shouldQueue,
    // Yield every crawled page to the loop below
    shouldYield: () => true,
  });

  const root = new URL('https://news.ycombinator.com/news');
  for await (const { location, parsed } of crawler(root)) {
    // Do something with the crawled result
    const titleElements = selectAll('a.storylink', parsed);
    const titles = titleElements.map((e) => ({
      value: getText(e),
      href: getAttributeValue(e as Element, 'href'),
    }));
    // Log all titles
    titles.forEach((title) => console.log(title.href, title.value));
  }
}

// Report a failed crawl explicitly instead of leaving a floating promise:
// log the error and signal failure through the process exit code.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
README.md |
13 changes: 12 additions & 1 deletion
13
modules/crawler-ts-fetch/package.json → packages/crawler-ts-fetch/package.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,17 @@ | ||
{ | ||
"name": "crawler-ts-fetch", | ||
"version": "1.1.0", | ||
"version": "1.1.1", | ||
"description": "Lightweight crawler written in TypeScript using ES6 generators.", | ||
"keywords": [ | ||
"crawl", | ||
"crawler", | ||
"crawling-framework", | ||
"crawling", | ||
"es6-generators", | ||
"typescript", | ||
"web-crawler", | ||
"web-crawling" | ||
], | ||
"author": { | ||
"name": "Gillis Van Ginderachter", | ||
"email": "[email protected]", | ||
|
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
README.md |
Oops, something went wrong.