-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.ts
46 lines (38 loc) · 1.37 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import { selectOne } from 'css-select';
import { getText } from 'domutils';
import { chain, allowRegex, ignoreDoubles } from 'crawler-ts/src';
import { createCrawler, allowHtml, allowProtocols } from 'crawler-ts-htmlparser2/src';
/**
 * Crawls NASA Mars news pages starting at https://mars.nasa.gov/news and
 * prints each yielded page's URL together with the text of its first <h1>
 * ("N/A" when the page has none).
 */
async function main() {
  // Matches news article URLs and captures the numeric article id.
  const newsArticlePattern = /\/mars\.nasa\.gov\/news\/([\d]+)\//;

  // Regex-based filter that is evaluated against the full href of a URL.
  const matchHref = allowRegex<URL>((url) => url.href);

  // Duplicate detection is keyed on the captured article id rather than the
  // whole URL; URLs without an id all share the empty-string key.
  const skipSeenArticles = ignoreDoubles<URL>((url) => {
    const captured = newsArticlePattern.exec(url.href);
    return captured?.[1] ?? '';
  });

  // Only parse text/html responses.
  const parseFilter = allowHtml();

  // A link is queued only when it passes every step of this chain:
  //   1. its protocol is http or https,
  //   2. it matches the news article pattern,
  //   3. its article id has not been seen before.
  const queueFilter = chain(
    allowProtocols(['http', 'https']),
    matchHref([newsArticlePattern]),
    skipSeenArticles(),
  );

  const crawl = createCrawler({
    shouldParse: parseFilter,
    shouldQueue: queueFilter,
    // Every crawled page is handed to the loop below.
    shouldYield: () => true,
  });

  const startUrl = new URL('https://mars.nasa.gov/news');
  for await (const page of crawl(startUrl)) {
    // Report the page heading, falling back to a placeholder when absent.
    const heading = selectOne('h1', page.parsed);
    let title = 'N/A';
    if (heading) {
      title = getText(heading).trim();
    }
    console.log(page.location.href, title);
  }
}
// Start the crawl. A rejection from main() is reported and turned into a
// non-zero exit code instead of being left as an unhandled floating promise.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});