-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathchromedl.go
355 lines (301 loc) · 9.51 KB
/
chromedl.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
// Package chromedl uses chromedp to download files. It may come in handy when
// one needs to get a file from a protected website that doesn't allow regular
// methods, such as curl or http.Get().
//
// It is heavily based on https://github.com/chromedp/examples/tree/master/download_file
// with minor modifications.
package chromedl
import (
"bytes"
"context"
"io"
"io/ioutil"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"github.com/chromedp/cdproto/browser"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/cdproto/page"
"github.com/chromedp/chromedp"
"github.com/pkg/errors"
"github.com/rusq/dlog"
)
// tempPrefix is the name prefix of the temporary directory that newInstance
// creates for downloaded files.
const tempPrefix = "chromedl"
// DefaultUA is the default user agent string that will be used by the browser
// instance started by New, unless it is overridden with OptUserAgent.
const DefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
// Instance is the browser instance that will be used for downloading files.
// It is created with New or NewWithChromeCtx and released with Stop.
type Instance struct {
// cfg is the configuration the instance was created with.
cfg config
ctx context.Context // context with the browser
allocFn context.CancelFunc // allocator cancel func
browserFn context.CancelFunc // browser cancel func
lnCancel context.CancelFunc // listener cancel func
// guidC receives the GUID of a completed download, or an empty string when
// the download was cancelled (sent by eventHandler).
guidC chan string
// requestIDC receives the ID of a tracked request once its loading has
// finished (sent by eventHandler).
requestIDC chan network.RequestID
// mu is held while writing to requests and while updating lnCancel.
mu sync.Mutex
// requests holds the IDs of requests that were announced with
// EventRequestWillBeSent and have not finished loading yet.
requests map[network.RequestID]bool
// tmpdir is the temporary directory the browser saves downloads to; it is
// removed by Stop.
tmpdir string
}
// config holds the settings that Option functions can modify.
type config struct {
// UserAgent is the user agent string that New passes to the browser.
UserAgent string
}
// runnerFn is the signature of the function used to run chromedp actions; it
// matches chromedp.Run.
type runnerFn = func(ctx context.Context, actions ...chromedp.Action) error
// runner runs chromedp actions. It is a package-level variable, rather than a
// direct call to chromedp.Run, to be able to mock it in tests.
var runner runnerFn = chromedp.Run
// Option is a function that adjusts the configuration of an Instance at
// creation time, for example OptUserAgent.
type Option func(*config)
// OptUserAgent returns an Option that sets the user agent string for the
// browser instance. An empty ua selects DefaultUA.
func OptUserAgent(ua string) Option {
	return func(c *config) {
		if ua == "" {
			// Fall back to the default and stop here, so that the empty
			// value does not overwrite it.
			c.UserAgent = DefaultUA
			return
		}
		c.UserAgent = ua
	}
}
// New creates a new Instance, starting up the headless chrome to do the download.
// Once finished, call Stop to terminate the browser.
//
// The configuration starts with DefaultUA as the user agent and is then
// modified by the supplied options, in order. An error is returned if the
// Instance cannot be set up; in that case the contexts created here are
// cancelled before returning.
func New(options ...Option) (*Instance, error) {
	cfg := config{
		UserAgent: DefaultUA,
	}
	for _, opt := range options {
		opt(&cfg)
	}

	// Slicing the default options array yields a slice with no spare
	// capacity, so append copies and the package-level defaults stay intact.
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.UserAgent(cfg.UserAgent),
	)

	allocCtx, aCancel := chromedp.NewExecAllocator(context.Background(), opts...)
	ctx, cCancel := chromedp.NewContext(allocCtx, chromedp.WithLogf(dlog.Printf), chromedp.WithDebugf(dlog.Debugf))

	bi, err := newInstance(ctx, cfg, aCancel, cCancel)
	if err != nil {
		// Nobody else holds the cancel functions at this point: release the
		// contexts here, browser context first, then the allocator.
		cCancel()
		aCancel()
		return nil, err
	}
	return bi, nil
}
// newInstance assembles an Instance around the given browser context. It
// creates the temporary download directory, allocates the channels and the
// request registry, and starts the event listener. allocCFn and ctxCFn are
// stored as is and may be nil.
func newInstance(ctx context.Context, cfg config, allocCFn, ctxCFn context.CancelFunc) (*Instance, error) {
	dir, err := ioutil.TempDir("", tempPrefix+"*")
	if err != nil {
		return nil, err
	}

	inst := &Instance{
		cfg:        cfg,
		ctx:        ctx,
		allocFn:    allocCFn,
		browserFn:  ctxCFn,
		tmpdir:     dir,
		guidC:      make(chan string),
		requestIDC: make(chan network.RequestID),
		requests:   make(map[network.RequestID]bool),
	}
	inst.startListener()

	return inst, nil
}
// ErrNoChrome indicates that there's no chrome instance in the context. It is
// returned by NewWithChromeCtx when chromedp.FromContext yields nil for the
// supplied context.
var ErrNoChrome = errors.New("no chrome instance in the context")
// NewWithChromeCtx creates new Instance for existing browser instance. Stop will not terminate
// the browser, but will cancel the event listener.
//
// taskCtx must be a chromedp context, otherwise ErrNoChrome is returned. The
// options are applied to the configuration stored in the Instance; this
// function does not reconfigure the already running browser with them.
func NewWithChromeCtx(taskCtx context.Context, options ...Option) (*Instance, error) {
	if chromedp.FromContext(taskCtx) == nil {
		return nil, ErrNoChrome
	}

	var cfg config
	for _, opt := range options {
		opt(&cfg)
	}

	// No cancel functions are passed: the browser is owned by the caller.
	return newInstance(taskCtx, cfg, nil, nil)
}
// Stop releases the resources held by the Instance: it stops the event
// listener, closes the download channels, cancels the browser and allocator
// contexts if the corresponding cancel functions are set, and removes the
// temporary directory together with any residual files. It returns the error
// from removing the directory, if any.
//
// Stop must be called at most once, as closing an already closed channel
// panics.
func (bi *Instance) Stop() error {
	bi.stopListener()

	// close download channels
	// NOTE(review): an event handler still blocked on a send to one of these
	// channels would panic after the close - confirm that no handler can be
	// in flight once the listener is cancelled.
	close(bi.guidC)
	close(bi.requestIDC)

	// cancel contexts if cancel functions are set; each function is guarded
	// by its own nil check.
	if bi.browserFn != nil {
		bi.browserFn()
	}
	if bi.allocFn != nil {
		bi.allocFn()
	}

	// remove temporary dir with any residual files
	return os.RemoveAll(bi.tmpdir)
}
// Download fetches the file at uri through a freshly started browser instance
// and returns a reader over its buffered contents, plus an error if any. If
// the error is present, reader may not be nil if the file was downloaded and
// read successfully. The file is stored in a temporary directory once the
// download is complete, then buffered, and a cleanup is attempted afterwards.
// No timeout is applied by default; set one on ctx if required. The optional
// opts configure the downloader.
func Download(ctx context.Context, uri string, opts ...Option) (io.Reader, error) {
	instance, startErr := New(opts...)
	if startErr != nil {
		return nil, startErr
	}
	// The browser is only needed for this single download.
	defer instance.Stop()

	reader, err := instance.Download(ctx, uri)
	return reader, err
}
// Get is drop-in replacement for http.Get. It starts a browser instance with
// the default configuration, fetches url with it and stops the instance
// before returning.
func Get(url string) (*http.Response, error) {
	instance, startErr := New()
	if startErr != nil {
		return nil, startErr
	}
	defer instance.Stop()

	resp, err := instance.Get(url)
	return resp, err
}
// stopListener stops the Listener by cancelling its context. It is a no-op if
// the listener is not running.
func (bi *Instance) stopListener() {
	// The nil check is done under the lock as well, so that the check and the
	// reset below form a single atomic step.
	bi.mu.Lock()
	defer bi.mu.Unlock()

	if bi.lnCancel == nil {
		return
	}
	// cancel listener context
	bi.lnCancel()
	bi.lnCancel = nil
}
// startListener subscribes eventHandler to the events of the browser target.
// The subscription is bound to a context derived from the browser context;
// its cancel function is kept in lnCancel for stopListener.
func (bi *Instance) startListener() {
	listenCtx, stop := context.WithCancel(bi.ctx)

	bi.mu.Lock()
	defer bi.mu.Unlock()

	bi.lnCancel = stop
	chromedp.ListenTarget(listenCtx, bi.eventHandler)
}
// eventHandler handles the browser events that the download logic relies on:
//
//   - page.EventDownloadProgress: sends the download GUID to guidC once the
//     download is completed, or an empty string if it was cancelled;
//   - network.EventRequestWillBeSent: registers the request ID in requests;
//   - network.EventLoadingFinished: if the request ID is registered, removes
//     it from requests and sends it to requestIDC.
//
// Any other event is only logged. The channel sends block until a receiver
// is ready.
func (bi *Instance) eventHandler(v interface{}) {
	switch ev := v.(type) {
	case *page.EventDownloadProgress:
		dlog.Debugf(">>> current download state: %s", ev.State.String())
		switch ev.State {
		case page.DownloadProgressStateCompleted:
			bi.guidC <- ev.GUID
		case page.DownloadProgressStateCanceled:
			bi.guidC <- ""
		}
	case *network.EventRequestWillBeSent:
		dlog.Debugf(">>> EventRequestWillBeSent: %v: %v", ev.RequestID, ev.Request.URL)
		bi.mu.Lock()
		bi.requests[ev.RequestID] = true
		bi.mu.Unlock()
	case *network.EventLoadingFinished:
		dlog.Debugf(">>> EventLoadingFinished: %v", ev.RequestID)
		// Look up and remove the entry under the same lock that guards the
		// writes to the map; the lock is released before the blocking send.
		bi.mu.Lock()
		tracked := bi.requests[ev.RequestID]
		delete(bi.requests, ev.RequestID)
		bi.mu.Unlock()
		if tracked {
			bi.requestIDC <- ev.RequestID
		}
	// TODO handle network.EventLoadingFailed
	default:
		dlog.Debugf("*** EVENT: %[1]T\n", v)
	}
}
// Download downloads the file returning the reader with contents. ctx bounds
// the waiting time; uri is the address of the file. The returned reader is
// nil when no data was obtained; it may be non-nil together with an error if
// the data was read but a later step failed.
func (bi *Instance) Download(ctx context.Context, uri string) (io.Reader, error) {
	buf, err := bi.download(ctx, uri)
	if buf == nil {
		// Return an untyped nil: a nil *bytes.Buffer stored in an io.Reader
		// would compare as non-nil on the caller's side.
		return nil, err
	}
	return buf, err
}
// download navigates the browser to uri and then waits for the transfer to
// complete, returning the buffered data. A navigation error is returned right
// away with a nil buffer.
func (bi *Instance) download(ctx context.Context, uri string) (*bytes.Buffer, error) {
	navErr := bi.navigate(ctx, uri)
	if navErr != nil {
		return nil, navErr
	}

	buf, err := bi.waitTransfer(ctx)
	return buf, err
}
// Get partly emulates http.Get and is meant to be a drop-in replacement for
// http.Get in the caller's code. The request is made with the background
// context, i.e. without a timeout.
func (bi *Instance) Get(url string) (*http.Response, error) {
	ctx := context.Background()
	return bi.get(ctx, url)
}
// get downloads url and wraps the data into a synthesised *http.Response.
//
// The response metadata is not taken from the browser: on a successful
// download the response always reports status 200 and HTTP/1.0, the body is
// the buffered data and ContentLength is its size. Any download error is
// returned with a nil response.
func (bi *Instance) get(ctx context.Context, url string) (*http.Response, error) {
	buf, err := bi.download(ctx, url)
	if err != nil {
		return nil, err
	}

	// The error is deliberately ignored: the request is only informational,
	// and a url that can't be parsed leaves Response.Request nil rather than
	// failing a download that has already succeeded.
	req, _ := http.NewRequest("GET", url, nil)

	resp := http.Response{
		// Status has the form "200 OK", as documented for http.Response.
		Status:        "200 " + http.StatusText(http.StatusOK),
		StatusCode:    http.StatusOK,
		Proto:         "HTTP/1.0",
		ProtoMajor:    1,
		ProtoMinor:    0,
		Body:          io.NopCloser(buf),
		ContentLength: int64(buf.Len()),
		Close:         true,
		Uncompressed:  true,
		Request:       req,
	}
	return &resp, nil
}
// navigate points the browser to uri. Before navigating it registers the
// script to be evaluated on every new document and configures the browser to
// save downloads into the temporary directory. The actions run on the browser
// context in a separate goroutine; navigate returns as soon as they finish or
// either the browser context or ctx is done, whichever happens first.
func (bi *Instance) navigate(ctx context.Context, uri string) error {
	addScript := chromedp.ActionFunc(func(ctx context.Context) error {
		scriptID, err := page.AddScriptToEvaluateOnNewDocument(script).Do(ctx)
		if err != nil {
			return err
		}
		dlog.Debugf("scriptID: %s", scriptID)
		return nil
	})
	saveToTmp := browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).WithDownloadPath(bi.tmpdir)

	// Buffered, so that the goroutine can always deliver the result and
	// exit, even when nobody is waiting for it anymore.
	done := make(chan error, 1)
	go func() {
		done <- runner(bi.ctx, addScript, saveToTmp, chromedp.Navigate(uri))
	}()

	select {
	case <-ctx.Done():
		return errors.WithStack(ctx.Err())
	case <-bi.ctx.Done():
		return errors.WithStack(bi.ctx.Err())
	case runErr := <-done:
		// Note: net::ERR_ABORTED must not be treated as a failure, since
		// downloads cause this page error to be emitted, although the
		// download still succeeds.
		if runErr == nil || strings.Contains(runErr.Error(), "net::ERR_ABORTED") {
			return nil
		}
		return errors.WithStack(runErr)
	}
}
// waitTransfer blocks until a completed download is reported on either the
// GUID channel or the request ID channel, or until ctx or the browser context
// is done. The data is then read from the downloaded file or from the
// response body respectively, and returned wrapped in a bytes.Buffer.
func (bi *Instance) waitTransfer(ctx context.Context) (*bytes.Buffer, error) {
	// Listening on both available channels to return the download.
	select {
	case guid := <-bi.guidC:
		// An empty GUID signals a cancelled download.
		if guid == "" {
			return nil, errors.New("download was cancelled")
		}
		data, err := bi.readFile(guid)
		return bytes.NewBuffer(data), err
	case id := <-bi.requestIDC:
		data, err := bi.readRequest(id)
		return bytes.NewBuffer(data), err
	case <-bi.ctx.Done():
		return nil, errors.WithStack(bi.ctx.Err())
	case <-ctx.Done():
		return nil, errors.WithStack(ctx.Err())
	}
}
// readFile returns the contents of the file called name in the temporary
// directory and then deletes the file. If the deletion fails, the contents
// are returned along with the error.
func (bi *Instance) readFile(name string) ([]byte, error) {
	// The exact location and name of the file are known because of the way
	// SetDownloadBehavior and WithDownloadPath are configured.
	fullPath := filepath.Join(bi.tmpdir, name)

	data, readErr := ioutil.ReadFile(fullPath)
	if readErr != nil {
		return nil, errors.WithStack(readErr)
	}
	dlog.Debugf("Download Complete: %s", fullPath)

	return data, os.Remove(fullPath)
}
// readRequest fetches the response body of the request reqID from the
// browser, running the action on the browser context.
func (bi *Instance) readRequest(reqID network.RequestID) ([]byte, error) {
	var body []byte

	fetchBody := chromedp.ActionFunc(func(ctx context.Context) error {
		var err error
		body, err = network.GetResponseBody(reqID).Do(ctx)
		return errors.WithStack(err)
	})

	err := runner(bi.ctx, fetchBody)
	if err != nil {
		return nil, errors.WithStack(err)
	}
	return body, nil
}