第十章_网络编程,37_网络编程_百度贴吧爬虫

创建爬虫程序:爬取百度"魔兽世界"(WoW)贴吧指定页码范围内每一页的全部内容,并将每一页分别保存为本地 HTML 文件(单任务、串行版本)。

源代码

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
)

// main prompts on standard output for the first and last page numbers, reads
// them from standard input, and passes the validated range to DoWork.
//
// Unreadable input or a range that contradicts the prompts is reported and
// the program returns without crawling anything.
func main() {
	var start, end int

	fmt.Println("请输入起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}

	fmt.Println("请输入终止页(>=起始页):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取终止页失败:", err)
		return
	}

	// Enforce what the prompts ask for: a start below 1 would turn into a
	// negative pn offset in the request URL, and end < start would make the
	// crawl loop silently do nothing.
	if start < 1 || end < start {
		fmt.Println("页码无效: 起始页必须 >= 1, 且终止页必须 >= 起始页")
		return
	}

	DoWork(start, end)
}

// DoWork crawls pages start through end (inclusive), one after another, and
// stores each downloaded page as "<page number>.html" in the current working
// directory.
//
// A page that cannot be fetched or saved is reported on standard output and
// skipped; the remaining pages are still processed.
func DoWork(start, end int) {
	fmt.Printf("正在爬取 %d 到 %d \n", start, end)

	for i := start; i <= end; i++ {
		// The pn query parameter advances by 50 per page:
		// page 1 -> pn=0, page 2 -> pn=50, page 3 -> pn=100, ...
		pageURL := "https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
		fmt.Println("url = ", pageURL)

		// Download the whole page.
		result, err := HttpGet(pageURL)
		if err != nil {
			fmt.Println("HttpGet err = ", err)
			continue
		}

		// Save the page to its own file.
		fileName := strconv.Itoa(i) + ".html"
		file, err := os.Create(fileName)
		if err != nil {
			fmt.Println("os.Create err4 = ", err)
			continue
		}
		if _, err := file.WriteString(result); err != nil {
			fmt.Println("file.WriteString err = ", err)
		}
		// Close inside the loop rather than with defer so each file is
		// released before the next page is fetched. Close can also surface a
		// delayed write failure, so its result is checked as well.
		if err := file.Close(); err != nil {
			fmt.Println("file.Close err = ", err)
		}
	}
}

// HttpGet downloads url with an HTTP GET request and returns the complete
// response body as a string.
//
// If the request cannot be performed, or the body cannot be read through to
// its end, result is empty and err describes the failure. A partially read
// body is never returned as if it were complete.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Read the body in 4 KiB chunks, collecting the bytes in one growing
	// slice instead of concatenating strings on every iteration.
	var body []byte
	buf := make([]byte, 4*1024)
	for {
		n, readErr := resp.Body.Read(buf)
		// A Read may return data together with an error, so consume the
		// bytes before looking at readErr.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			// Normal end of the body; keep the progress line on the console.
			fmt.Println("resp.Body.Read err3 = ", readErr)
			break
		}
		if readErr != nil {
			// Any other error means the download was cut short.
			return "", fmt.Errorf("reading body of %s: %w", url, readErr)
		}
		// n == 0 with a nil error is allowed by io.Reader and only means
		// "nothing happened"; keep reading.
	}
	return string(body), nil
}

打印输出

请输入起始页(>=1):
1
请输入终止页(>=起始页):
3
正在爬取 1 到 3
url =  https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=0
resp.Body.Read err3 =  EOF
url =  https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=50
resp.Body.Read err3 =  EOF
url =  https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=100
resp.Body.Read err3 =  EOF

猜你喜欢

转载自blog.csdn.net/weixin_40355471/article/details/115288499