-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.js
124 lines (116 loc) · 3.32 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// TODO provide url list + title : https://serpapi.com/sports-results
// TODO get page title : document.querySelector("#documentation > div.user-h1-wrap > div > h1")
// TODO document.querySelector(".integrationsMountPoint")
// TODO select language: document.querySelector("#integrationsMountPoint-7348 > div > div.code-header > div.code-links > div")
// TODO click to copy code document.querySelector("#integrationsMountPoint-7348 > div > div:nth-child(2) > ul > li > div > button")
const puppeteer = require('puppeteer');
const fs = require('fs')
/**
 * Option values chosen, in this order, in each code widget's language
 * dropdown while scraping.
 */
const languages = ["ruby", "java", "node", "dotnet", "go", "php"];
/**
 * Extract the code samples shown on a page.
 *
 * Navigates `page` to `url`, collects the text of every `h4` element and the
 * id of every `.integrationsMountPoint` element, then, for each of those ids
 * and each language, selects the language in the widget's dropdown, clicks
 * the widget's button and reads the resulting text back from the clipboard.
 *
 * @param {string} url - Address of the page to load.
 * @param {object} page - Puppeteer page; `goto`, `evaluate`, `select` and
 *   `click` are used.
 * @param {string[]} [langs=languages] - Values to select in the language
 *   dropdown, in order.
 * @returns {Promise<object>} An object with `title` (h4 texts) and `ids`
 *   (mount-point ids), plus one `{ content, language }` entry stored under
 *   each id; `content[i]` is the clipboard text read for `language[i]`.
 */
const getCode = async (url, page, langs = languages) => {
  await page.goto(url, { waitUntil: 'networkidle0' });

  // Runs inside the browser: gather headings and widget ids in one round trip.
  const ctx = await page.evaluate(() => {
    const title = Array.from(document.querySelectorAll('h4'), (heading) => heading.innerText);
    const ids = Array.from(document.querySelectorAll('.integrationsMountPoint'), (div) => div.id);
    return { title, ids };
  });

  for (const id of ctx.ids) {
    // Declared locally; the previous undeclared assignment leaked a global
    // (and throws in strict mode).
    const group = { content: [], language: [] };
    for (const language of langs) {
      await page.select(`#${id} > div > div.code-header > div.code-links > div > select`, language);
      await page.click(`#${id} > div > div:nth-child(2) > ul > li > div > button`);
      // The click is expected to put the snippet on the clipboard; read it back.
      const content = await page.evaluate(`(async () => await navigator.clipboard.readText())()`);
      group.content.push(content);
      group.language.push(language);
    }
    ctx[id] = group;
  }
  return ctx;
};
/**
 * Mapping from engine name to what was gathered for it. Starts empty; the
 * main script adds one `{ code: [], url: [] }` entry per engine, e.g.
 * `google: { code: [], url: ["https://serpapi.com/sports-results"] }`.
 */
const db = {};
/**
 * Entry point.
 *
 * 1. Opens https://serpapi.com/search-api and collects the `href` of every
 *    `.dashboard-submenu-wrap > li > a` link.
 * 2. Drops links matching the skip pattern and groups the rest in `db` by
 *    the engine name found at the start of the path ("google" when none of
 *    the known engine names matches).
 * 3. Calls `getCode` for every grouped URL and appends the result to that
 *    engine's `code` list; a URL that fails is logged and recorded as `null`
 *    so `code[i]` still lines up with `url[i]`.
 * 4. Writes `db` as JSON to `scrape.json`.
 *
 * The browser is closed even when a step throws, and any error escaping the
 * run is logged and turns into a non-zero exit code.
 */
(async () => {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    const context = browser.defaultBrowserContext();
    await page.goto("https://serpapi.com/search-api");

    const links = await page.evaluate(() =>
      Array.from(
        document.querySelectorAll(".dashboard-submenu-wrap > li > a"),
        (anchor) => anchor.href
      )
    );

    const enginePattern = /^https:\/\/serpapi\.com\/(youtube|yandex|walmart|ebay|bing|google|baidu|yahoo)/i;
    const skipPattern = /^https:\/\/serpapi\.com\/(searches|invoices|plan|users|extra-credits|manage-api-key|credit-card|dashboard|change-plan)/i;

    for (const link of links) {
      if (skipPattern.test(link)) {
        continue;
      }
      const match = enginePattern.exec(link);
      // The pattern is case-insensitive, so normalise the key to avoid
      // splitting one engine across differently-cased entries.
      const engine = match ? match[1].toLowerCase() : "google";
      if (db[engine] == null) {
        db[engine] = {
          code: [],
          url: []
        };
      }
      db[engine].url.push(link);
    }
    console.log(db);

    for (const engine of Object.keys(db)) {
      for (const url of db[engine].url) {
        let code = null;
        try {
          // getCode reads the copied snippet through navigator.clipboard.
          await context.overridePermissions(url, ['clipboard-read']);
          console.log("scrape " + url);
          code = await getCode(url, page);
        } catch (err) {
          // Best effort: keep going with the remaining URLs.
          console.error(err);
        }
        db[engine].code.push(code);
      }
    }

    // save
    const path = "scrape.json";
    fs.writeFileSync(path, JSON.stringify(db));
    console.log("saved " + path);
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});