1. Exercise: Web Crawler
Solution #1: use a channel to synchronize goroutines
- <cache *Cache> records visited URLs so that no URL is fetched twice
- <ch chan int> lets the caller wait for a goroutine to finish
package main

import (
	"fmt"
	"sync"
)

// Crawl uses fetcher to recursively crawl pages starting with url,
// to a maximum of depth, and signals ch when it finishes.
func Crawl(url string, depth int, fetcher Fetcher, cache *Cache, ch chan int) {
	// Signal the caller exactly once, on every return path.
	defer func() { ch <- 1 }()
	if depth <= 0 {
		return
	}
	// Check and mark the URL under the lock, so that two goroutines
	// cannot both decide to fetch it.
	cache.mu.Lock()
	if cache.urls[url] {
		cache.mu.Unlock()
		return
	}
	cache.urls[url] = true
	cache.mu.Unlock()
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("Found: %s %q\n", url, body)
	// Launch all child crawls first, then wait for all of them;
	// receiving inside the launch loop would serialize the crawl.
	subch := make(chan int, len(urls))
	for _, u := range urls {
		go Crawl(u, depth-1, fetcher, cache, subch)
	}
	for range urls {
		<-subch
	}
}
// Cache records which URLs have been visited; mu guards urls.
type Cache struct {
	mu   sync.Mutex
	urls map[string]bool
}
func main() {
	cache := Cache{urls: make(map[string]bool)}
	ch := make(chan int)
	go Crawl("https://golang.org/", 4, fetcher, &cache, ch)
	<-ch // block until the root Crawl signals completion
}
Output looks like:
Found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
Found: https://golang.org/pkg/ "Packages"
Found: https://golang.org/pkg/os/ "Package os"
Found: https://golang.org/pkg/fmt/ "Package fmt"
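Both listings assume the Fetcher interface and the canned fakeFetcher data from the Tour's exercise scaffold; the "not found" line above is fakeFetcher's error for a URL missing from its map. An abridged sketch of that scaffold (the full map on the exercise page also has entries for /pkg/, /cmd/, and so on):

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	// ... remaining pages omitted for brevity
}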
Solution #2: use sync.WaitGroup to synchronize goroutines (same package, imports, and scaffold as Solution #1)
Refer to (both discuss the wg.Add placement pitfall, sketched after the code below):
https://stackoverflow.com/questions/12224962/exercise-web-crawler-concurrency-not-working
https://www.dyxmq.cn/program/code/golang/waitgroup-in-golang.html
// Crawl uses fetcher to recursively crawl pages starting with url,
// to a maximum of depth. Every call matches exactly one wg.Add(1)
// performed by its caller before the call starts.
func Crawl(url string, depth int, fetcher Fetcher, cache *Cache, wg *sync.WaitGroup) {
	defer wg.Done()
	if depth <= 0 {
		return
	}
	// Check and mark the URL under the lock, as in Solution #1.
	cache.mu.Lock()
	if cache.urls[url] {
		cache.mu.Unlock()
		return
	}
	cache.urls[url] = true
	cache.mu.Unlock()
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("Found: %s %q\n", url, body)
	for _, u := range urls {
		// Add before starting the goroutine, never inside it, so
		// wg.Wait cannot observe a zero counter while work remains.
		wg.Add(1)
		go Crawl(u, depth-1, fetcher, cache, wg)
	}
}
// Cache is identical to the one in Solution #1.
type Cache struct {
	mu   sync.Mutex
	urls map[string]bool
}
func main() {
	cache := Cache{urls: make(map[string]bool)}
	wg := &sync.WaitGroup{}
	wg.Add(1) // account for the root Crawl call
	go Crawl("https://golang.org/", 4, fetcher, &cache, wg)
	wg.Wait() // block until every Crawl has called Done
}
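The pitfall the two links above describe is moving wg.Add into the spawned goroutine itself, which races with wg.Wait. A minimal sketch of the wrong and right placements (work and urls are hypothetical names used only for illustration):

// Wrong: Wait may run before any goroutine has executed Add,
// see the counter at zero, and return while work is still pending.
for _, u := range urls {
	go func(u string) {
		wg.Add(1) // too late: races with wg.Wait below
		defer wg.Done()
		work(u)
	}(u)
}
wg.Wait()

// Right: Add in the parent, before the goroutine starts, so the
// counter is already positive by the time Wait can inspect it.
for _, u := range urls {
	wg.Add(1)
	go func(u string) {
		defer wg.Done()
		work(u)
	}(u)
}
wg.Wait()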
The journey into 'A Tour of Go' ends here!