// Command spider is a simple breadth-first web crawler.
//
// Usage: spider <start-url> <host-prefix>
// Example: spider http://www.jd.com jd.com
//
// It fetches pages whose hostname starts with <host-prefix>, prints each
// page's URL and size, and enqueues every link found on the page.
package main

import (
	"container/list"
	"errors"
	"fmt"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery" // HTML parsing
	"github.com/gogather/com/log"
)

// httpClient is shared by every request; the timeout prevents the crawler
// from hanging forever on a slow or dead host.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// isurlok reports whether uri's hostname starts with prefix, i.e. whether
// the URL belongs to the site we are allowed to crawl.
func isurlok(uri *url.URL, prefix string) bool {
	return strings.HasPrefix(uri.Hostname(), prefix)
}

// fetch downloads target (if its host matches prefix) and returns the
// absolute URLs of all <a href> links on the page together with the page's
// HTML. It returns an error for unparsable URLs, skipped hosts, HTTP
// failures, and non-200 responses.
func fetch(target, prefix string) ([]string, string, error) {
	uri, err := url.Parse(target)
	if err != nil {
		return nil, "", err
	}
	if !isurlok(uri, prefix) {
		return nil, "", errors.New("skip " + target)
	}

	resp, err := httpClient.Get(target)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, "", errors.New("unexpected status " + resp.Status + " for " + target)
	}

	// NewDocumentFromReader replaces the deprecated NewDocumentFromResponse.
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, "", err
	}

	var urls []string
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		link, ok := s.Attr("href")
		if !ok || link == "" {
			return
		}
		// Resolve every href ("/path", "page.html", "//host/x", absolute)
		// against the page URL. BUG FIX: the original did `u := uri`,
		// which copied the *pointer*, so `u.Path = link` mutated the
		// shared base URL and corrupted every later link on the page.
		ref, err := url.Parse(link)
		if err != nil {
			return // skip malformed hrefs
		}
		abs := uri.ResolveReference(ref)
		// Match the original's normalization: drop query and fragment so
		// the visited-set deduplicates variants of the same page.
		abs.RawQuery = ""
		abs.Fragment = ""
		urls = append(urls, abs.String())
	})

	body, err := doc.Html()
	if err != nil {
		// BUG FIX: the original returned a nil error here, silently
		// discarding the serialization failure.
		return urls, "", err
	}
	return urls, body, nil
}

func main() {
	// Breadth-first crawl: after fetching a page, enqueue all of its
	// links; already-fetched pages are skipped via the visited set.
	if len(os.Args) < 3 {
		fmt.Fprintln(os.Stderr, "usage: spider <start-url> <host-prefix>")
		os.Exit(1)
	}
	webroot := os.Args[1]
	prefix := os.Args[2]

	visited := make(map[string]bool)
	queue := list.New() // FIFO queue backing the BFS
	queue.PushBack(webroot)

	for queue.Len() != 0 {
		front := queue.Front()
		queue.Remove(front)
		// Named `page`, not `url`, to avoid shadowing the net/url package.
		page := front.Value.(string)
		if visited[page] {
			continue
		}
		visited[page] = true

		urls, body, err := fetch(page, prefix)
		if err != nil {
			log.Println(err)
			continue
		}
		fmt.Printf("%s %0.2fk\n", page, float32(len(body))/1024.0)
		for _, u := range urls {
			queue.PushBack(u)
		}
	}
}
编译好的可执行程序:https://download.csdn.net/download/mypc2010/10407523
用法:spider.exe http://www.jd.com jd.com
拓展阅读:https://www.zhihu.com/question/23011311