forked from torbiak/gopl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfindlinks.go
58 lines (53 loc) · 1.15 KB
/
findlinks.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
// ex8.6 is a depth-limited web crawler.
//
// Use a WaitGroup to determine when the work is done, the `tokens` chan as a
// semaphore to limit concurrent requests, and a mutex around the `seen` map to
// avoid concurrent reads and writes.
package main
import (
"flag"
"fmt"
"log"
"sync"
"gopl.io/ch5/links"
)
// tokens is a counting semaphore used to
// enforce a limit of 20 concurrent requests.
var tokens = make(chan struct{}, 20)

// maxDepth is the maximum crawl depth; set from the -d flag in main.
var maxDepth int

// seen records URLs that have already been queued for crawling.
// seenLock must be held for every read and write of seen, since
// multiple crawl goroutines access it concurrently.
var seen = make(map[string]bool)
var seenLock sync.Mutex
// crawl prints url at its depth, then recursively crawls the pages it
// links to until maxDepth is reached. It marks wg done when it returns
// and adds one to wg for each child goroutine it spawns, so the caller
// can wait for the whole crawl to finish.
func crawl(url string, depth int, wg *sync.WaitGroup) {
	defer wg.Done()
	fmt.Println(depth, url)
	if depth >= maxDepth {
		return
	}
	// The tokens semaphore bounds the number of in-flight HTTP requests.
	tokens <- struct{}{} // acquire a token
	list, err := links.Extract(url)
	<-tokens // release the token
	if err != nil {
		log.Print(err)
		return // no links to follow on a failed fetch
	}
	for _, link := range list {
		// Mark each link before spawning so no URL is crawled twice;
		// seenLock guards seen against concurrent crawl goroutines.
		seenLock.Lock()
		if seen[link] {
			seenLock.Unlock()
			continue
		}
		seen[link] = true
		seenLock.Unlock()
		wg.Add(1)
		go crawl(link, depth+1, wg)
	}
}
// main reads the -d flag for the maximum crawl depth, starts one crawl
// goroutine per URL given on the command line, and blocks until every
// spawned crawler has finished.
func main() {
	flag.IntVar(&maxDepth, "d", 3, "max crawl depth")
	flag.Parse()

	var wg sync.WaitGroup
	for _, url := range flag.Args() {
		wg.Add(1)
		go crawl(url, 0, &wg)
	}
	wg.Wait()
}