Capítulo diez: Programación de red, 37. Programación de red: rastreador (爬虫) de Baidu Tieba (百度贴吧)

Cree un programa rastreador (crawler) que descargue todo el contenido de las páginas indicadas del foro «魔兽世界» (World of Warcraft) de Baidu Tieba y guarde cada página en un archivo local. Versión de una sola tarea (sin concurrencia).

Código fuente

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
)

// main prompts for a start page and an end page on standard input and then
// crawls that inclusive page range with DoWork.
//
// Input that cannot be parsed as an integer, a start page below 1, or an end
// page below the start page is reported and the program returns without
// crawling, so DoWork never sees a range that would produce a negative page
// offset.
func main() {
	var start, end int

	fmt.Println("请输入起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}
	if start < 1 {
		fmt.Println("起始页必须 >= 1")
		return
	}

	fmt.Println("请输入终止页(>=起始页):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取终止页失败:", err)
		return
	}
	if end < start {
		fmt.Println("终止页必须 >= 起始页")
		return
	}

	DoWork(start, end)
}

// DoWork crawls pages start through end (inclusive) of the Baidu Tieba forum
// selected by the kw query parameter and saves each page to "<page>.html",
// a path relative to the current working directory.
//
// A page that cannot be fetched or saved is reported on standard output and
// skipped; the remaining pages are still processed.
func DoWork(start, end int) {
	fmt.Printf("正在爬取 %d 到 %d \n", start, end)

	for i := start; i <= end; i++ {
		// The pn query parameter is an offset that advances by 50 per page,
		// so page i maps to pn = (i-1)*50.
		url := "https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
		fmt.Println("url = ", url)

		// Download the full page body.
		result, err := HttpGet(url)
		if err != nil {
			fmt.Println("HttpGet err = ", err)
			continue
		}

		// Write the page to its own file.
		fileName := strconv.Itoa(i) + ".html"
		file, err := os.Create(fileName)
		if err != nil {
			fmt.Println("os.Create err4 = ", err)
			continue
		}

		_, writeErr := file.WriteString(result)
		// Close unconditionally so the descriptor is released even when the
		// write failed; a Close error can also signal that data was not
		// flushed, so it is reported too.
		closeErr := file.Close()
		if writeErr != nil {
			fmt.Println("file.WriteString err = ", writeErr)
			continue
		}
		if closeErr != nil {
			fmt.Println("file.Close err = ", closeErr)
		}
	}
}

// HttpGet fetches url with an HTTP GET and returns the entire response body
// as a string.
//
// It returns a non-nil error when the request itself fails, when the server
// answers with a non-2xx status, or when reading the body fails. In every
// error case result is the empty string, so callers never receive a
// truncated page.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Treat error responses as failures instead of returning them as content.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// io.ReadAll reads until EOF, treats EOF as success, tolerates zero-byte
	// reads, and avoids the quadratic cost of repeated string concatenation.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	return string(body), nil
}

Salida del programa

请输入起始页(>=1):
1
请输入终止页(>=起始页):
3
正在爬取 1 到 3
url =  https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=0
resp.Body.Read err3 =  EOF
url =  https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=50
resp.Body.Read err3 =  EOF
url =  https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=100
resp.Body.Read err3 =  EOF

Supongo que te gusta

Origin blog.csdn.net/weixin_40355471/article/details/115288499
Recomendado
Clasificación