go爬虫实践 获取图片

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/RGBMarco/article/details/82190305

一般图片的位置都藏在html中img的src标签里,所有我们可以遍历html的img标签,获取并存取,我采用了goquery一个很不错的解析html的库
下面上代码,爬的的是新浪的,主线程sleep是为了防止goroutine泄漏,当然也可以通过通道避免

package main

import (
    "github.com/PuerkitoBio/goquery"
    "fmt"
    "os"
//  "log"
    "path"
    "strings"
    "regexp"
    "math/rand"
    "io"
    "net/http"
    "time"
)

func randFileName(dirPath string,suffix string) (fileName string) {
    var randChar string = "abcdefghimnjkqzxyt0123456789ioedkaldncalew0129387iue"
    var name string
    size := len(randChar)
    for i := 0; i < 12; i++ {
        name += string(randChar[rand.Intn(size)])
    }
    fileName = path.Join(dirPath,name)
    fileName += suffix
    _,err := os.Stat(fileName)
    if err != nil {
        if os.IsNotExist(err) {
            return fileName
        } else {
            return randFileName(dirPath,suffix)
        }
    }
    return randFileName(dirPath,suffix)
}

func writeImage(dirPath string,href string) {
    i := strings.LastIndex(href,".")
    suffix := href[i:]
    fileName := randFileName(dirPath,suffix)
    imageFile,err := os.Create(fileName)
    if err != nil {
        fmt.Printf("[writeImage create file]: fileName: %s\n href: %s\nerror: %s\n",fileName,href,err.Error())
        return
    }
    resp,err := http.Get(href)
    if err != nil {
        fmt.Printf("[writeImage http.Get: fileName]: %s\nhref: %s\n error: %s\n",fileName,href,err.Error())
        os.Remove(fileName)
        return
    }
    size,err := io.Copy(imageFile,resp.Body)
    if err != nil {
        fmt.Println("io.Copy:error: %s  href: %s\n",err.Error(),href)
        os.Remove(fileName)
    }
    fmt.Printf("Get From %s: %d bytes\n",href,size)
    return
}

func getCorrectHref(href string,url string) (correctHref string,exist bool){
    completeValidator := regexp.MustCompile(`\A((http)|(https))(.+)((.jpg)|(.png)|(.jpeg))\z`)
    imcompleteValidator := regexp.MustCompile(`\A(||)(.+)((.jpg)|(.jpeg)|(.png))\z`)
    if completeValidator.MatchString(href) {
        //fmt.Println(href)
        return href,true
    } else if imcompleteValidator.MatchString(href) {
        ret := href[1:]
        ret =  url + ret
        //fmt.Println(ret)
        return string(ret),true
    }
    return "",false
}

func main() {
    url := "http://sports.sina.com.cn"
    doc,err := goquery.NewDocument(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    pwd,err := os.Getwd()
    if err != nil {
        fmt.Println(err)
        return
    }
    dirPath := path.Join(pwd,"images")
    _,err = os.Stat(dirPath)
    if err != nil {
        if os.IsNotExist(err) {
            err = os.Mkdir(dirPath,os.ModePerm)
            if err != nil {
                fmt.Println(err)
                return
            }
        }
    }
    doc.Find("img").Each(func(i int,selection *goquery.Selection) {
            href,exist := selection.Attr("src")
            if exist {
                correctHref,exist := getCorrectHref(href,url)
                if exist {
                    go writeImage(dirPath,correctHref)
                }
            }
        })
    time.Sleep(time.Duration(12) * time.Second)
}

获取到的图片:

这里写图片描述

go!

这里写图片描述

猜你喜欢

转载自blog.csdn.net/RGBMarco/article/details/82190305