A concurrent web crawler in Go

This program asks for a page range, then fetches those pages of a Baidu Tieba forum concurrently, one goroutine per page, saving each page to a local HTML file and reporting completion over a channel.

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
)

func main() {
	// Read the page range, then crawl each page and save it to a file.
	var start, end int
	fmt.Println("Enter the start page to crawl (>= 1)")
	fmt.Scan(&start)
	fmt.Println("Enter the end page to crawl (>= start)")
	fmt.Scan(&end)
	fmt.Println("Starting the crawl")
	scrape(start, end)
}

// spiderhtml downloads page i of the forum, saves it to a local HTML
// file, and reports its page number on pagechan.
func spiderhtml(i int, pagechan chan int) {
	// Always signal completion, even on error, so the collector loop
	// in scrape never blocks forever waiting for this page.
	defer func() { pagechan <- i }()

	// Page i starts at offset (i-1)*50 in the pn query parameter.
	url := "http://tieba.baidu.com/f?kw=%E9%87%8D%E5%BA%86%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("http get err", err)
		return
	}
	defer resp.Body.Close()

	f, err2 := os.Create("page" + strconv.Itoa(i) + ".html")
	if err2 != nil {
		fmt.Println("os create err", err2)
		return
	}
	defer f.Close()

	// Copy the response body to the file in 4 KiB chunks.
	buff := make([]byte, 4096)
	for {
		n, err := resp.Body.Read(buff)
		if n > 0 {
			f.Write(buff[:n])
		}
		if err != nil {
			errprint("resp body err", err) // io.EOF just means end of stream
			break
		}
	}
}
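// Note: the manual read/write loop above could be replaced by io.Copy,
// which performs the same chunked copy (a sketch, not the author's code):
//
//	_, err = io.Copy(f, resp.Body)
//	errprint("copy err", err)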

// scrape launches one goroutine per page, then counts completion
// signals on pagechan until every page has reported back.
func scrape(start int, end int) {
	pagechan := make(chan int)
	fmt.Printf("Crawling pages %d to %d\n", start, end)
	for i := start; i <= end; i++ {
		go spiderhtml(i, pagechan)
	}

	// Receive exactly end-start+1 results; pages may finish in any order.
	for i := start; i <= end; i++ {
		fmt.Printf("Page %d finished\n", <-pagechan)
	}
}

// errprint reports any error that is not the expected io.EOF.
func errprint(s string, e error) {
	if e != nil && e != io.EOF {
		fmt.Println(s, e)
	}
}
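Launching one goroutine per page is fine for a small range, but an unbounded fan-out can overwhelm the target site on large ranges. Below is a minimal sketch of the same pattern using sync.WaitGroup for completion and a buffered channel as a semaphore to cap concurrency; maxWorkers and fetchPage are illustrative names, not part of the original program.

package main

import (
	"fmt"
	"sync"
)

// fetchPage stands in for the download logic in spiderhtml.
func fetchPage(page int) {
	fmt.Println("fetched page", page)
}

func main() {
	const maxWorkers = 8 // assumption: cap on simultaneous fetches
	sem := make(chan struct{}, maxWorkers)
	var wg sync.WaitGroup

	for i := 1; i <= 30; i++ {
		wg.Add(1)
		go func(page int) {
			defer wg.Done()
			sem <- struct{}{}        // acquire a worker slot
			defer func() { <-sem }() // release it when done
			fetchPage(page)
		}(i)
	}

	wg.Wait() // block until every goroutine has called Done
	fmt.Println("all pages done")
}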
