chore(prettier): fix prettier formatting arc
j-mendez committed Jul 20, 2024
1 parent f4276b5 commit 43bff3c
Showing 49 changed files with 3,481 additions and 3,550 deletions.
15 changes: 15 additions & 0 deletions .editorconfig
@@ -0,0 +1,15 @@
+# EditorConfig helps developers define and maintain consistent
+# coding styles between different editors or IDEs
+# http://editorconfig.org
+root = true
+
+[*]
+indent_style = space
+indent_size = 2
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.md]
+trim_trailing_whitespace = false
2 changes: 1 addition & 1 deletion .eslintrc.yml
@@ -166,4 +166,4 @@ overrides:
          'private-instance-method',
        ],
      },
-]
+]
3 changes: 1 addition & 2 deletions .github/workflows/CI.yml
@@ -97,7 +97,7 @@ jobs:
          #   docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-alpine
          #   build: |
          #     set -e &&
-          #     rustup target add aarch64-unknown-linux-musl &&
+          #     rustup target add aarch64-unknown-linux-musl &&
          #     yarn build --target aarch64-unknown-linux-musl
          #   env:
          #     CXXFLAGS: '--stdlib=libc++ -L/usr/lib/llvm-18/lib -static'
@@ -258,7 +258,6 @@ jobs:
        run: ls -R .
        shell: bash

-
  test-linux-x64-gnu-binding:
    name: Test bindings on Linux-x64-gnu - node@${{ matrix.node }}
    needs:
2 changes: 1 addition & 1 deletion .github/workflows/book.yml
@@ -26,4 +26,4 @@ jobs:
        if: ${{ github.ref == 'refs/heads/main' }}
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
-          publish_dir: ./book/book
+          publish_dir: ./book/book
2 changes: 2 additions & 0 deletions .prettierignore
@@ -0,0 +1,2 @@
+target
+.yarn
7 changes: 7 additions & 0 deletions .taplo.toml
@@ -0,0 +1,7 @@
+exclude = ["node_modules/**/*.toml"]
+
+# https://taplo.tamasfe.dev/configuration/formatter-options.html
+[formatting]
+align_entries = true
+indent_tables = true
+reorder_keys = true
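
A quick illustration of what these Taplo formatter options do: `align_entries` lines up the `=` signs, `indent_tables` indents sub-tables under their parent, and `reorder_keys` sorts keys alphabetically. The TOML below is a hypothetical input (not part of this commit), shown after formatting:

```toml
# Hypothetical file, shown as Taplo would emit it with the options above.
[dependencies]
anyhow = "1"        # keys sorted alphabetically by `reorder_keys`
serde  = "1"        # `=` signs lined up by `align_entries`

  # sub-tables indented under their parent by `indent_tables`
  [dependencies.tokio]
  features = ["full"]
  version  = "1"
```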
4 changes: 2 additions & 2 deletions .vscode/settings.json
@@ -1,3 +1,3 @@
{
-  "rust-analyzer.procMacro.ignored": { "napi-derive": ["napi"] }
-}
+  "rust-analyzer.procMacro.ignored": { "napi-derive": ["napi"] }
+}
874 changes: 874 additions & 0 deletions .yarn/releases/yarn-3.6.4.cjs

Large diffs are not rendered by default.

894 changes: 0 additions & 894 deletions .yarn/releases/yarn-4.3.1.cjs

This file was deleted.

4 changes: 2 additions & 2 deletions .yarnrc.yml
@@ -1,5 +1,5 @@
nodeLinker: node-modules

-npmAuditRegistry: "https://registry.npmjs.org"
+npmAuditRegistry: https://registry.npmjs.org

-yarnPath: .yarn/releases/yarn-4.3.1.cjs
+yarnPath: .yarn/releases/yarn-3.6.4.cjs
99 changes: 47 additions & 52 deletions README.md
@@ -7,124 +7,119 @@ The [spider](https://github.com/spider-rs/spider) project ported to Node.js

1. `npm i @spider-rs/spider-rs --save`

```ts
-import { Website, pageTitle } from "@spider-rs/spider-rs";
+import { Website, pageTitle } from '@spider-rs/spider-rs'

-const website = new Website("https://rsseau.fr")
+const website = new Website('https://rsseau.fr')
  .withHeaders({
-    authorization: "somerandomjwt",
+    authorization: 'somerandomjwt',
  })
  .withBudget({
-    "*": 20, // limit max request 20 pages for the website
-    "/docs": 10, // limit only 10 pages on the `/docs` paths
+    '*': 20, // limit max request 20 pages for the website
+    '/docs': 10, // limit only 10 pages on the `/docs` paths
  })
-  .withBlacklistUrl(["/resume"]) // regex or pattern matching to ignore paths
-  .build();
+  .withBlacklistUrl(['/resume']) // regex or pattern matching to ignore paths
+  .build()

// optional: page event handler
const onPageEvent = (_err, page) => {
-  const title = pageTitle(page); // comment out to increase performance if title not needed
-  console.info(`Title of ${page.url} is '${title}'`);
+  const title = pageTitle(page) // comment out to increase performance if title not needed
+  console.info(`Title of ${page.url} is '${title}'`)
  website.pushData({
    status: page.statusCode,
    html: page.content,
    url: page.url,
    title,
-  });
-};
+  })
+}

-await website.crawl(onPageEvent);
-await website.exportJsonlData("./storage/rsseau.jsonl");
-console.log(website.getLinks());
+await website.crawl(onPageEvent)
+await website.exportJsonlData('./storage/rsseau.jsonl')
+console.log(website.getLinks())
```

Collect the resources for a website.

```ts
-import { Website } from "@spider-rs/spider-rs";
+import { Website } from '@spider-rs/spider-rs'

-const website = new Website("https://rsseau.fr")
+const website = new Website('https://rsseau.fr')
  .withBudget({
-    "*": 20,
-    "/docs": 10,
+    '*': 20,
+    '/docs': 10,
  })
  // you can use regex or string matches to ignore paths
-  .withBlacklistUrl(["/resume"])
-  .build();
+  .withBlacklistUrl(['/resume'])
+  .build()

-await website.scrape();
-console.log(website.getPages());
+await website.scrape()
+console.log(website.getPages())
```

Run the crawls in the background on another thread.

```ts
-import { Website } from "@spider-rs/spider-rs";
+import { Website } from '@spider-rs/spider-rs'

-const website = new Website("https://rsseau.fr");
+const website = new Website('https://rsseau.fr')

const onPageEvent = (_err, page) => {
-  console.log(page);
-};
+  console.log(page)
+}

-await website.crawl(onPageEvent, true);
+await website.crawl(onPageEvent, true)
// runs immediately
```

Use headless Chrome rendering for crawls.

```ts
-import { Website } from "@spider-rs/spider-rs";
+import { Website } from '@spider-rs/spider-rs'

-const website = new Website("https://rsseau.fr").withChromeIntercept(
-  true,
-  true,
-);
+const website = new Website('https://rsseau.fr').withChromeIntercept(true, true)

const onPageEvent = (_err, page) => {
-  console.log(page);
-};
+  console.log(page)
+}

// the third param determines headless chrome usage.
-await website.crawl(onPageEvent, false, true);
-console.log(website.getLinks());
+await website.crawl(onPageEvent, false, true)
+console.log(website.getLinks())
```

Cron jobs can be done with the following.

```ts
-import { Website } from "@spider-rs/spider-rs";
+import { Website } from '@spider-rs/spider-rs'

-const website = new Website("https://choosealicense.com").withCron(
-  "1/5 * * * * *",
-);
+const website = new Website('https://choosealicense.com').withCron('1/5 * * * * *')
// sleep function to test cron
const stopCron = (time: number, handle) => {
  return new Promise((resolve) => {
    setTimeout(() => {
-      resolve(handle.stop());
-    }, time);
-  });
-};
+      resolve(handle.stop())
+    }, time)
+  })
+}

-const links = [];
+const links = []

const onPageEvent = (err, value) => {
-  links.push(value);
-};
+  links.push(value)
+}

-const handle = await website.runCron(onPageEvent);
+const handle = await website.runCron(onPageEvent)

// stop the cron in 4 seconds
-await stopCron(4000, handle);
+await stopCron(4000, handle)
```

Use the crawl shortcut to get the page content and url.

```ts
-import { crawl } from "@spider-rs/spider-rs";
+import { crawl } from '@spider-rs/spider-rs'

-const { links, pages } = await crawl("https://rsseau.fr");
-console.log(pages);
+const { links, pages } = await crawl('https://rsseau.fr')
+console.log(pages)
```

## Benchmarks
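
The README rewrite above (double quotes to single quotes, semicolons dropped) matches Prettier's `singleQuote` and `semi` options. This commit does not add a Prettier config file, so the following is only an inferred sketch of settings that would produce that output:

```js
// prettier.config.js — hypothetical, inferred from the diff above;
// this commit does not actually include a Prettier config file.
module.exports = {
  semi: false, // omit trailing semicolons: `.build()` instead of `.build();`
  singleQuote: true, // prefer 'single' over "double" quotes
}
```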
