-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.js
100 lines (84 loc) · 2.37 KB
/
main.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
// Origin of the shop being scraped; pagination hrefs are appended to it.
const base = 'https://www.morele.net';
// Category listing page the crawl starts from.
const url = `${base}/kategoria/klocki-lego-1045/`;
// Upper bound on the number of pages that will be visited.
const maxVisits = 40;
// URLs that have already been handed to the crawler.
const visited = new Set();
// Product records accumulated across every crawled page.
const allProducts = [];
// Shared task queue; it is assigned when start() runs.
let q;
/**
 * Creates an in-memory task queue that runs at most `concurrency` worker
 * loops at the same time.
 *
 * @param {number} [concurrency=5] - Maximum number of worker loops allowed
 *   to drain the queue simultaneously.
 * @returns {{ enqueue: (task: Function, ...params: any[]) => Promise<void> }}
 *   An object exposing `enqueue`.
 */
const queue = (concurrency = 5) => {
  // Number of worker loops currently draining `tasks`.
  let running = 0;
  const tasks = [];
  return {
    /**
     * Adds a task to the queue. If a worker slot is free, the calling
     * invocation becomes a worker and keeps running queued tasks until the
     * queue is empty; otherwise it returns right away and an already active
     * worker will pick the task up.
     *
     * @param {Function} task - Function to call; it is awaited.
     * @param {...any} params - Arguments passed to `task`.
     * @returns {Promise<void>} Resolves when this invocation stops working
     *   (immediately if no slot was free). Rejects with the error of a task
     *   that this worker ran and that threw.
     */
    enqueue: async (task, ...params) => {
      tasks.push({ task, params });
      if (running >= concurrency) {
        return;
      }
      ++running;
      try {
        while (tasks.length) {
          const next = tasks.shift();
          await next.task(...next.params);
        }
      } finally {
        // Always release the slot: without this a throwing task would leave
        // `running` incremented forever and eventually block every worker.
        --running;
      }
    },
  };
};
/**
 * Requests a page with axios and resolves with the response payload.
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise<*>} The `data` property of the axios response.
 */
const getHtml = async url => {
  const response = await axios.get(url);
  return response.data;
};
/**
 * Collects one record per `.cat-product-content` element on a loaded page.
 *
 * The price text is expected to use a comma as the decimal separator and may
 * contain spaces between digit groups; both are normalized before parsing.
 * The piece count is the first integer found in the feature element whose
 * text contains "Liczba elementów:" (Polish for "Number of pieces:").
 *
 * @param {Function} $ - Cheerio-style selector function for the page.
 * @returns {Array<{title: (string|undefined), price: (number|undefined),
 *   pieces: (number|undefined), pricePerPiece: (number|undefined)}>}
 *   Extracted products; fields that cannot be determined are `undefined`.
 */
const extractContent = $ =>
  $('.cat-product-content')
    .map((_, product) => {
      const $product = $(product);
      const priceMatch = $product
        .find('.cat-product-price .price-new')
        .text()
        .match(/\d+( ?\d*)*,?\d*/);
      const piecesMatch = $product
        .find('.cat-product-feature:contains("Liczba elementów:")')
        .text()
        .match(/\d+/);

      // Parse each value once and reuse it for the derived ratio.
      const price = priceMatch
        ? Number.parseFloat(priceMatch[0].replaceAll(' ', '').replace(',', '.'))
        : undefined;
      const pieces = piecesMatch ? Number.parseInt(piecesMatch[0], 10) : undefined;

      // A zero piece count would yield Infinity, which JSON serializes as
      // null; treat it the same as an unknown ratio.
      const pricePerPiece = price !== undefined && pieces > 0 ? price / pieces : undefined;

      return {
        title: $product.find('.cat-product-name a').attr('title'),
        price,
        pieces,
        pricePerPiece,
      };
    })
    .toArray();
/**
 * Returns the distinct absolute URLs referenced by the page's
 * `.pagination-btn` elements.
 *
 * Elements without an `href` attribute are skipped, so no
 * "<base>undefined" link is ever produced. Each href is resolved against
 * `base`, so both site-relative and already absolute hrefs give a valid URL.
 *
 * @param {Function} $ - Cheerio-style selector function for the page.
 * @returns {string[]} Unique absolute URLs in document order.
 */
const extractLinks = $ => {
  const hrefs = $('.pagination-btn')
    .toArray()
    .map(a => $(a).attr('href'))
    .filter(Boolean);
  const absolute = hrefs.map(href => new URL(href, base).href);
  return [...new Set(absolute)];
};
/**
 * Fetches one page, records its products and schedules every pagination
 * link that has not been visited yet.
 *
 * The URL is marked as visited before the request is made so that pages
 * crawled concurrently do not schedule it again. A page that cannot be
 * fetched is logged and skipped instead of rejecting.
 *
 * @param {string} url - Page to crawl.
 * @returns {Promise<void>}
 */
const crawl = async url => {
  visited.add(url);
  console.log('Crawling: ', url);

  let html;
  try {
    html = await getHtml(url);
  } catch (err) {
    // Nothing awaits the queue's promises, so a rejection here would surface
    // as an unhandled rejection. Skip the page and keep crawling.
    console.error('Failed to crawl: ', url, err.message);
    return;
  }

  const $ = cheerio.load(html);

  // Store this page's products BEFORE scheduling links: enqueue may run
  // crawlTask synchronously, and crawlTask calls finish() once the visit
  // limit is reached, which writes out whatever allProducts holds then.
  allProducts.push(...extractContent($));

  for (const link of extractLinks($)) {
    if (!visited.has(link)) {
      // Deliberately not awaited (awaiting would serialize the crawl), but
      // failures are still reported instead of being left unhandled.
      q.enqueue(crawlTask, link).catch(err => {
        console.error('Task failed for: ', link, err);
      });
    }
  }
};
/**
 * Queue task wrapping crawl(): enforces the visit limit and skips URLs that
 * were already visited.
 *
 * Once `visited` holds `maxVisits` entries, every invocation calls finish()
 * and returns without crawling.
 *
 * @param {string} url - Candidate page to crawl.
 * @returns {Promise<void>}
 */
const crawlTask = async url => {
  const budgetSpent = visited.size >= maxVisits;
  if (budgetSpent) {
    finish();
    return;
  }
  if (!visited.has(url)) {
    await crawl(url);
  }
};
/**
 * Creates the shared queue with its default concurrency, stores it in `q`
 * and schedules the first crawl at the start URL.
 */
const start = () => {
  const crawlQueue = queue();
  q = crawlQueue;
  crawlQueue.enqueue(crawlTask, url);
};
/**
 * Writes every collected product to `products.json`, ordered by ascending
 * price per piece, as JSON indented with 4 spaces.
 *
 * Products whose `pricePerPiece` is not a finite number (for example when
 * the price or piece count could not be parsed) are placed after all ranked
 * products. Subtracting `undefined` would make the comparator return NaN and
 * leave the order inconsistent. A copy is sorted, so the shared
 * `allProducts` array keeps its order.
 */
const finish = () => {
  const rank = product =>
    Number.isFinite(product.pricePerPiece) ? product.pricePerPiece : Infinity;
  const sorted = [...allProducts].sort((a, b) => {
    const left = rank(a);
    const right = rank(b);
    if (left === right) {
      return 0;
    }
    return left < right ? -1 : 1;
  });
  fs.writeFileSync('products.json', JSON.stringify(sorted, null, 4));
};
// Script entry point: the crawl begins as soon as this file is executed.
start()