创建爬虫程序,爬取百度wow贴吧指定页面全部内容,保存到本地文件,单任务
源代码
package main
import (
"fmt"
"net/http"
"os"
"strconv"
)
func main() {
var start, end int
fmt.Println("请输入起始页(>=1):")
fmt.Scan(&start)
fmt.Println("请输入终止页(>=起始页):")
fmt.Scan(&end)
DoWork(start, end)
}
func DoWork(start, end int) {
fmt.Printf("正在爬取 %d 到 %d \n", start, end)
//明确目标,准备在哪个网站爬取
for i := start; i <= end; i++ {
url := "https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
fmt.Println("url = ", url)
//爬,把页面所有内容全部爬下来
result, err1 := HttpGet(url)
if err1 != nil {
fmt.Println("HttpGet err = ", err1)
continue
}
//把内容写入文件
fimeName := strconv.Itoa(i) + ".html"
file, err4 := os.Create(fimeName)
if err4 != nil {
fmt.Println("os.Create err4 = ", err4)
continue
}
file.WriteString(result)
file.Close()
}
}
//爬取网页内容
func HttpGet(url string) (result string, err error) {
resp, err2 := http.Get(url)
if err2 != nil {
err = err2
return
}
defer resp.Body.Close()
//读取网页body内容
buf := make([]byte, 4*1024)
for {
n, err3 := resp.Body.Read(buf)
if n == 0 {
//读取结束,或出问题
fmt.Println("resp.Body.Read err3 = ", err3)
break
}
result += string(buf[:n])
}
return
}
打印输出
请输入起始页(>=1):
1
请输入终止页(>=起始页):
3
正在爬取 1 到 3
url = https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=0
resp.Body.Read err3 = EOF
url = https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=50
resp.Body.Read err3 = EOF
url = https://tieba.baidu.com/f?kw=%E9%AD%94%E5%85%BD%E4%B8%96%E7%95%8C&ie=utf-8&pn=100
resp.Body.Read err3 = EOF