使用Go爬豆瓣电影Top250 并写入数据库

这几天看了go语言,练习一下写法,结合项目会比较有趣,碰到的问题也会比较多。

Go爬虫

参考网站 【go语言爬虫】go语言爬取豆瓣电影top250

使用Go爬豆瓣电影Top250 并写入数据库

##### 先看效果
image

准备工作

mysql(如未安装,也可以不要安装,直接看代码,打印控制台即可)
go 环境, 略,baidu并配置环境变量
开发工具 当前使用过的是LiteIDE,好像还有更好的编译工具

mysql

如何安装,略
数据库结构如下

## Schema for the `film` table that receives the scraped entries.
## Any existing `film` table is dropped first, so re-running this script
## discards previously stored rows.
## Columns other than `id` match the fields written by DbInsert:
## name, detail, score, commentCount, icon.
DROP TABLE IF EXISTS `film`;
CREATE TABLE `film` (
  `id` int(255) NOT NULL AUTO_INCREMENT,
  `name` varchar(255) NOT NULL,
  `detail` varchar(255) DEFAULT NULL,
  `score` float DEFAULT '0',
  `commentCount` int(11) DEFAULT '0',
  `icon` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=175 DEFAULT CHARSET=utf8;
## Empty the table so that ids start over (per the original note). Handy while
## learning, when the table is dropped and recreated often; otherwise not needed.
truncate table `film`;

当前代码连接数比较多… 需要修改mysql最大连接数

mysql -u root
show variables like "max_connections";
set GLOBAL max_connections=1000; 

go代码

main.go

// FilmProject project main.go
package main

import (
	"fmt"
	"time"
)

// main is the program entry point. It prints a start banner, runs the full
// crawl through getAllFilm and prints "end" once that call has returned.
func main() {
	fmt.Println("Hello World!")
	getAllFilm()
	// Alternative entry point that processes only the first page:
	// getFirstFilmList()
	fmt.Println("end")
}

// getFirstFilmList fetches only the first list page, prints the parsed films
// and stores each of them through DbInsert.
//
// If the fetch fails, the error is reported together with its cause and the
// function returns early instead of carrying on with an empty result.
func getFirstFilmList() {
	list, err := GetFilm(true)
	if err != nil {
		fmt.Println("err  ", err)
		return
	}
	fmt.Println(list)
	// Ranging over a nil or empty slice is a no-op, so no length guard is needed.
	for _, v := range list {
		DbInsert(v)
	}
}

// getAllFilm crawls the list page by page, collects every parsed film, then
// prints and stores the whole collection via printFilmList and finally
// reports how many films were gathered.
//
// A failed page is reported and skipped so that one bad request does not
// abort the whole crawl.
func getAllFilm() {
	const (
		// pageCount is the number of list pages requested in total
		// (the first page plus pageCount-1 follow-up pages).
		pageCount = 10
		// pageDelay is the pause before each follow-up request. time.Sleep
		// takes a time.Duration, which counts nanoseconds, so a bare 2000
		// would pause for only two microseconds.
		pageDelay = 2 * time.Second
	)

	list, err := GetFilm(true)
	if err != nil {
		fmt.Println("- --  ", err)
	}
	for i := 1; i < pageCount; i++ {
		time.Sleep(pageDelay)
		fmt.Println("开始第", i, "页")
		l, e := GetFilm(false)
		if e != nil {
			// GetFilm only advances its page counter after a successful
			// fetch, so the next iteration asks for the same page again.
			fmt.Println("page", i, "failed:", e)
			continue
		}
		list = append(list, l...)
	}

	printFilmList(list)
	fmt.Println("size: ", len(list))
}

// printFilmList walks the slice in order. Each film is first written to
// standard output through its String method and then stored with DbInsert.
func printFilmList(list []FilmBean) {
	for i := 0; i < len(list); i++ {
		film := list[i]
		film.String()
		DbInsert(film)
	}
}

RequestParseUtils.go
请求/解析film,并返回

// RequestParseUtils
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
	//"strings"
)

// forgeHeaders holds the browser-like request headers that sendHttp copies
// onto every outgoing request.
var forgeHeaders = map[string]string{
	"Host":                      "movie.douban.com",
	"Connection":                "keep-alive",
	"Cache-Control":             "max-age=0",
	"Upgrade-Insecure-Requests": "1",
	"User-Agent":                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
	"Accept":                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
	"Referer":                   "https://movie.douban.com/top250",
}

// Earlier attempts at the title pattern, kept for reference only:
// urlTitleRe = regexp.MustCompile(`img alt="(.*?)" src=`)
// urlTitleRe = regexp.MustCompile(`<a href="(.*?)"> </a>`)
// urlTitleRe = regexp.MustCompile(`<a href="(.*?)"> <img [^>]* alt="(.*?)" src="(.*?)" [^>]*></a>`)

// curPage is the zero-based index of the list page GetFilm requests next.
var curPage = 0

// pageUrl is the list-page URL template; its %d verb receives the start
// offset, which GetFilm computes as curPage*25.
var pageUrl = "https://movie.douban.com/top250?start=%d&filter="

/*主要用于请求,并解析,返回数据*/

// FilmBean is one movie entry scraped from a list page.
// The fields are filled by parseFilmFormResp and written to the database by
// DbInsert.
type FilmBean struct {
	// detail is the URL captured from the entry's <a href="..."> attribute.
	detail       string
	// icon is the value of the poster <img> src attribute.
	icon         string
	// name is the value of the poster <img> alt attribute.
	name         string
	// score is the number parsed from the property="v:average" span.
	score        float32
	// commentCount is the number parsed from the "...人评价" span.
	commentCount int
}

// String writes the film to standard output: an empty line first, then one
// line with name, score, comment count, detail URL and icon URL (without a
// trailing newline). It returns nothing, so FilmBean does not satisfy
// fmt.Stringer.
func (f FilmBean) String() {
	const layout = "name:%s,score %f,comment %d, detail: %s, icon: %s"

	fmt.Println("")
	fmt.Printf(layout, f.name, f.score, f.commentCount, f.detail, f.icon)
}

// GetFilm downloads and parses the list page selected by the package-level
// curPage counter.
//
// When isFirst is true the counter is reset to 0 before the request, so the
// first page is fetched. The counter is advanced only after a successful
// download, so a failed call leaves it pointing at the same page.
//
// It returns the films parsed from the page, or a nil slice and the error
// reported by sendHttp.
func GetFilm(isFirst bool) (arr []FilmBean, err error) {
	switch {
	case isFirst:
		curPage = 0
	default:
		fmt.Print("第几页")
	}

	// Every page lists 25 entries, so the start offset is curPage*25.
	target := fmt.Sprintf(pageUrl, curPage*25)
	fmt.Printf("url: %s", target)

	body, err := sendHttp(target)
	if err != nil {
		return nil, err
	}

	arr = parseFilmFormResp(body)
	curPage++
	return arr, nil
}

// sendHttp issues a GET request for url with the forgeHeaders applied and
// returns the response body as a string.
//
// An error is returned when the request cannot be built or sent, when the
// body cannot be read, or when the server answers with a status other than
// 200 OK. The http client does not treat non-2xx answers as errors by itself,
// so without the explicit check an error page would be handed to the parser
// as if it were a list page.
func sendHttp(url string) (resp string, err error) {
	client := http.Client{}
	req, err := http.NewRequest("GET", url, nil)
	fmt.Println("发送请求")
	if err != nil {
		fmt.Print("出错了")
		return "", err
	}
	for k, v := range forgeHeaders {
		req.Header.Add(k, v)
	}

	response, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, response.Status)
	}

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// Patterns used to scrape a list page. They are compiled once at package
// initialisation instead of on every parseFilmFormResp call.
var (
	// filmLinkRe captures, in order: the detail-page URL, the title (the
	// poster's alt text) and the poster image URL.
	filmLinkRe = regexp.MustCompile(`<a href="(.*?)">[^>]*<img [^>]* alt="(.*?)" src="(.*?)"[^>]*>[^>]*</a>`)
	// filmCommentRe captures the number in front of "人评价".
	filmCommentRe = regexp.MustCompile(`<span>(.*?)人评价</span>`)
	// filmScoreRe captures the average rating.
	filmScoreRe = regexp.MustCompile(`property="v:average">(.*?)</span>`)
)

// parseFilmFormResp extracts the films contained in the HTML of a list page.
//
// One FilmBean is produced per poster link found in resp, in document order.
// Ratings and comment counts are paired with the links by position. When the
// page has fewer rating or comment spans than links, or a captured value is
// not a number, the corresponding field keeps its zero value instead of
// causing an index-out-of-range panic.
//
// It returns nil when resp contains no poster link.
func parseFilmFormResp(resp string) (arr []FilmBean) {
	fmt.Println("parse-------------")

	links := filmLinkRe.FindAllStringSubmatch(resp, -1)
	if len(links) == 0 {
		return nil
	}
	comments := filmCommentRe.FindAllStringSubmatch(resp, -1)
	scores := filmScoreRe.FindAllStringSubmatch(resp, -1)

	// Debug output: echo the first match before the full listing.
	first := links[0]
	fmt.Printf("url: %s, name:%s,  img:%s", first[1], first[2], first[3])
	fmt.Println("")

	films := make([]FilmBean, len(links))
	for i, m := range links {
		fb := FilmBean{detail: m[1], name: m[2], icon: m[3]}

		// The three match lists are independent, so guard every lookup.
		if i < len(scores) {
			// An unparsable score is left at zero on purpose.
			if v, err := strconv.ParseFloat(scores[i][1], 32); err == nil {
				fb.score = float32(v)
			}
		}
		if i < len(comments) {
			// An unparsable count is left at zero on purpose.
			if n, err := strconv.Atoi(comments[i][1]); err == nil {
				fb.commentCount = n
			}
		}

		fmt.Printf("url: %s, name:%s,  img:%s", m[1], m[2], m[3])
		fmt.Println("", i)
		films[i] = fb
	}
	return films
}

FilmDb.go

// FilmDb
package main

import (
	"database/sql"
	"fmt"

	_ "github.com/go-sql-driver/mysql"
)

// DbInsert stores one film as a new row of the `film` table and prints the
// id reported for the inserted row.
//
// A database handle is opened on every call, so both the handle and the
// prepared statement are closed before returning. Without that, each insert
// leaves its connection open and the server eventually runs out of
// connections.
//
// Any failure panics through checkErr; the deferred Close calls still run
// in that case.
func DbInsert(f FilmBean) {
	fmt.Println("")
	db, err := sql.Open("mysql", "root:@/go_film?charset=utf8")
	checkErr(err)
	defer db.Close()

	stmt, err := db.Prepare(`INSERT film (name,detail,score,commentCount,icon) values (?,?,?,?,?)`)
	checkErr(err)
	defer stmt.Close()

	res, err := stmt.Exec(f.name, f.detail, f.score, f.commentCount, f.icon)
	checkErr(err)
	id, err := res.LastInsertId()
	checkErr(err)
	fmt.Println(id)
}
// checkErr panics with err when err is non-nil and does nothing otherwise.
func checkErr(err error) {
	if err == nil {
		return
	}
	panic(err)
}

注意:
如果不接入mysql,把main.go里面的 DbInsert(v)、DbInsert(bean) 删除即可
如接入mysql, 注意上面的 import _ "github.com/go-sql-driver/mysql"
非windows系统,根据网上使用git安装mysql驱动

此处对windows 手动安装说明
参考 https://www.cnblogs.com/wangqishu/p/5147108.html

下载包
https://github.com/go-sql-driver/mysql/releases
当前最新版本,mysql-1.4.1
查看自己的gopath 如:D:\devTools\go
在 gopath 的 src/ 下创建 github.com\go-sql-driver\mysql 目录,对应import的结构;解压刚下载的zip 到该目录; (应该也可以直接把mysql解压到src下,import改成mysql,不带前面路径)

例:D:\devTools\go\src\github.com\go-sql-driver\mysql

猜你喜欢

转载自blog.csdn.net/zgf1991/article/details/84316818
今日推荐