Implementing a Simple Web Crawler in Go

Single-threaded version

The single-threaded version walks the target pages in a loop: for each page index it builds the URL, downloads the HTML, and writes it to a numbered .html file.

package main

import (
    "fmt"
    "net/http"
    "os"
    "strconv"
)

func pachong(start, end int) {
    // Base URL of the pages to crawl
    url := "https://tieba.baidu.com/f?kw=%E6%AE%B5%E5%AD%90&ie=utf-8&pn="
    for i := start; i <= end; i++ {
        fmt.Printf("Fetching page %d\n", i)
        res, err := curl(url + strconv.Itoa(i*50))
        if err != nil {
            fmt.Println("Error:", err)
            continue // skip this page instead of writing an empty file
        }
        name := strconv.Itoa(i) + ".html"
        // Create the output file
        f, err1 := os.Create(name)
        if err1 != nil {
            fmt.Println("Failed to create file:", err1)
            continue // without this, f would be nil and WriteString would panic
        }
        // Write the page content and close the file
        f.WriteString(res)
        f.Close()
    }
}

// curl downloads a page and returns its body as a string
func curl(url string) (reques string, err error) {
    resq, err1 := http.Get(url)
    if err1 != nil {
        err = err1
        return
    }
    // Close the body when done
    defer resq.Body.Close()
    str := make([]byte, 1024*5)
    // Read the body in 5 KB chunks until EOF
    for {
        number, err2 := resq.Body.Read(str)
        if number > 0 {
            reques += string(str[:number])
        }
        if err2 != nil { // io.EOF marks the normal end of the body
            break
        }
    }
    fmt.Println("Page fetched successfully!")
    return
}

func main() {
    var start, end int
    fmt.Println("[Enter the start page:]")
    // Read user input
    fmt.Scan(&start)
    fmt.Println("[Enter the end page:]")
    fmt.Scan(&end)
    pachong(start, end)
}
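
The chunked read loop in curl works, but the standard library already covers this case. As a minimal sketch (using io.ReadAll, available since Go 1.16; older code used ioutil.ReadAll), the whole function can shrink to:

package main

import (
    "fmt"
    "io"
    "net/http"
)

// curl downloads a page and returns its body as a string.
func curl(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()
    // io.ReadAll replaces the manual 5 KB read loop.
    data, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return string(data), nil
}

func main() {
    body, err := curl("https://tieba.baidu.com/")
    if err != nil {
        fmt.Println("Error:", err)
        return
    }
    fmt.Println("Fetched", len(body), "bytes")
}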


Concurrent version

The concurrent version starts one goroutine per page and uses an unbuffered channel to report completions back to pachong, which waits for exactly one signal per page.

package main

import (
    "fmt"
    "net/http"
    "os"
    "strconv"
)



func pachong(start, end int) {
    c := make(chan int)
    // Base URL of the pages to crawl
    url := "https://tieba.baidu.com/f?kw=%E6%AE%B5%E5%AD%90&ie=utf-8&pn="
    for i := start; i <= end; i++ {
        go paing(i, url+strconv.Itoa(i*50), c)
    }
    // Receive exactly one completion signal per page. Ranging over the
    // channel would deadlock: it is never closed, so the range would
    // block forever after the last goroutine finishes.
    for i := start; i <= end; i++ {
        fmt.Printf("Finished page %d\n", <-c)
    }
}

func paing(i int, url string, c chan int) {
    // Always signal completion, even on error, so pachong's
    // receive loop never blocks waiting for a missing send.
    defer func() { c <- i }()
    fmt.Printf("Fetching page %d: %s\n", i, url)
    res, err := curl(url)
    if err != nil {
        fmt.Println("Error:", err)
        return
    }
    name := strconv.Itoa(i) + ".html"
    // Create the output file
    f, err1 := os.Create(name)
    if err1 != nil {
        fmt.Println("Failed to create file:", err1)
        return
    }
    // Write the page content and close the file
    f.WriteString(res)
    f.Close()
}

// curl downloads a page and returns its body as a string
func curl(url string) (reques string, err error) {
    resq, err1 := http.Get(url)
    if err1 != nil {
        err = err1
        return
    }
    // Close the body when done
    defer resq.Body.Close()
    str := make([]byte, 1024*5)
    // Read the body in 5 KB chunks until EOF
    for {
        number, err2 := resq.Body.Read(str)
        if number > 0 {
            reques += string(str[:number])
        }
        if err2 != nil { // io.EOF marks the normal end of the body
            break
        }
    }
    return
}

func main() {
    var start, end int
    fmt.Println("[Enter the start page:]")
    // Read user input
    fmt.Scan(&start)
    fmt.Println("[Enter the end page:]")
    fmt.Scan(&end)
    pachong(start, end)
}
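
Counting sends on a channel works here because every goroutine is guaranteed to send exactly once. A common alternative is sync.WaitGroup; a minimal sketch of the same fan-out/wait pattern (the actual fetching and saving is omitted):

package main

import (
    "fmt"
    "sync"
)

func main() {
    var wg sync.WaitGroup
    for i := 1; i <= 3; i++ {
        wg.Add(1)
        go func(page int) {
            defer wg.Done() // signal completion even if the work fails
            fmt.Printf("Fetching page %d\n", page)
            // ... download and save the page here ...
        }(i)
    }
    wg.Wait() // block until every goroutine has called Done
    fmt.Println("All pages finished")
}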


pengfu crawler

This version crawls joke pages from pengfu.com: each page is downloaded, the jokes' titles and bodies are extracted with regular expressions, and every joke is written to its own .txt file named after its title.

package main

import (
    "fmt"
    "net/http"
    "os"
    "regexp"
    "strconv"
    "strings"
)



func pachong(start, end int) {
    c := make(chan int)
    for i := start; i <= end; i++ {
        // Build the URL of the page to crawl
        url := "https://www.pengfu.com/xiaohua_" + strconv.Itoa(i) + ".html"
        go paing(i, url, c)
    }
    // Receive exactly one completion signal per page; the channel is
    // never closed, so ranging over it would deadlock at the end.
    for i := start; i <= end; i++ {
        fmt.Printf("Finished page %d\n", <-c)
    }
}

func paing(i int, url string, c chan int) {
    // Always signal completion, even on error, so pachong's
    // receive loop never blocks waiting for a missing send.
    defer func() { c <- i }()
    fmt.Printf("Fetching page %d: %s\n", i, url)
    res, err := curl(url)
    if err != nil {
        fmt.Println("Error:", err)
        return
    }
    // Pair each title with its content and write one .txt file per joke
    for k, v := range res["title"] {
        if k < len(res["content"]) { // guard against mismatched match counts
            xie(v[1], res["content"][k][1])
        }
    }
}

// xie writes one joke to a .txt file named after its title
func xie(title, content string) {
    name := title + ".txt"
    // Create the output file (a title containing characters that are
    // invalid in file names, such as "/", will make this fail)
    f, err1 := os.Create(name)
    if err1 != nil {
        fmt.Println("Failed to create file:", err1)
        return
    }
    // Write the content and close the file
    f.WriteString(content)
    f.Close()
}

// curl downloads a page and extracts titles and contents with regexps
func curl(url string) (res map[string][][]string, err error) {
    res = make(map[string][][]string)
    resq, err := HttpGet(url)
    if err != nil {
        return res, err
    }
    // Strip tabs, carriage returns, and <br /> tags so the
    // regular expressions match across what were line breaks
    resq = strings.Replace(resq, "\t", "", -1)
    resq = strings.Replace(resq, "\r", "", -1)
    resq = strings.Replace(resq, "<br />", "", -1)
    // Extract the titles
    rel1 := regexp.MustCompile(`<div class='bdsharebuttonbox clearfix social_group' title="(.*)" humorId="(.*)"`)
    title := rel1.FindAllStringSubmatch(resq, -1)
    // Extract the contents
    rel2 := regexp.MustCompile(`<div class="content-img clearfix pt10 relative">(?s:(.*?))</div>`)
    content := rel2.FindAllStringSubmatch(resq, -1)
    res["title"] = title
    res["content"] = content
    return res, nil
}

// HttpGet downloads a page and returns its body as a string
func HttpGet(url string) (result string, err error) {
    resq, err1 := http.Get(url)
    if err1 != nil {
        err = err1
        return
    }
    // Close the body when done
    defer resq.Body.Close()
    buf := make([]byte, 1024*5)
    // Read the body in 5 KB chunks until EOF
    for {
        number, err2 := resq.Body.Read(buf)
        if number > 0 {
            result += string(buf[:number])
        }
        if err2 != nil { // io.EOF marks the normal end of the body
            break
        }
    }
    return
}

func main() {
    var start, end int
    fmt.Println("[Enter the start page:]")
    // Read user input
    fmt.Scan(&start)
    fmt.Println("[Enter the end page:]")
    fmt.Scan(&end)
    pachong(start, end)
}
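
FindAllStringSubmatch returns a [][]string in which element 0 of each match is the full match and element 1 is the first capture group; that is how curl builds the title and content slices above. A self-contained sketch against a made-up fragment (the markup here is illustrative, not pengfu.com's real layout):

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Hypothetical HTML fragment; the real pengfu.com markup differs.
    html := `<h1 class="dp-b"><a href="/a">First joke</a></h1>
<div class="content-img clearfix pt10 relative">Body one</div>
<h1 class="dp-b"><a href="/b">Second joke</a></h1>
<div class="content-img clearfix pt10 relative">Body two</div>`

    reTitle := regexp.MustCompile(`<h1 class="dp-b"><a href=".*?">(.*?)</a></h1>`)
    reBody := regexp.MustCompile(`<div class="content-img clearfix pt10 relative">(?s:(.*?))</div>`)

    titles := reTitle.FindAllStringSubmatch(html, -1) // each entry: [full match, group 1]
    bodies := reBody.FindAllStringSubmatch(html, -1)

    for k, t := range titles {
        if k < len(bodies) { // guard against mismatched counts
            fmt.Printf("title=%q content=%q\n", t[1], bodies[k][1])
        }
    }
}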


Reposted from blog.csdn.net/feiwutudou/article/details/80704279